annotate ext/serd/src/reader.c @ 226:c5cdc9e6a4bf

Add these external library files
author Chris Cannam <cannam@all-day-breakfast.com>
date Fri, 09 Jun 2017 16:41:31 +0100
parents
children
rev   line source
cannam@226 1 /*
cannam@226 2 Copyright 2011-2017 David Robillard <http://drobilla.net>
cannam@226 3
cannam@226 4 Permission to use, copy, modify, and/or distribute this software for any
cannam@226 5 purpose with or without fee is hereby granted, provided that the above
cannam@226 6 copyright notice and this permission notice appear in all copies.
cannam@226 7
cannam@226 8 THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
cannam@226 9 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
cannam@226 10 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
cannam@226 11 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
cannam@226 12 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
cannam@226 13 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
cannam@226 14 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
cannam@226 15 */
cannam@226 16
cannam@226 17 #include "serd_internal.h"
cannam@226 18
cannam@226 19 #include <assert.h>
cannam@226 20 #include <ctype.h>
cannam@226 21 #include <errno.h>
cannam@226 22 #include <stdarg.h>
cannam@226 23 #include <stdint.h>
cannam@226 24 #include <stdio.h>
cannam@226 25 #include <stdlib.h>
cannam@226 26 #include <string.h>
cannam@226 27
/* Namespace URI prefixes for the XML Schema and RDF vocabularies. */
#define NS_XSD "http://www.w3.org/2001/XMLSchema#"
#define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

/* Jump to the enclosing function's `except' label when `exp' is false. */
#define TRY_THROW(exp) if (!(exp)) goto except;
/* Return 0 from the enclosing function when `exp' is false. */
#define TRY_RET(exp) if (!(exp)) return 0;

/* With SERD_STACK_CHECK defined, assert that `ref' is the most recently
   recorded push offset; otherwise the macro expands to nothing. */
#ifdef SERD_STACK_CHECK
# define SERD_STACK_ASSERT_TOP(reader, ref) \
assert(ref == reader->allocs[reader->n_allocs - 1]);
#else
# define SERD_STACK_ASSERT_TOP(reader, ref)
#endif
cannam@226 40
/* Position in the input, copied into every error report by r_err(). */
typedef struct {
    const uint8_t* filename;
    unsigned line; /* Incremented by eat_byte_safe() on each '\n' */
    unsigned col;  /* Reset to 0 on '\n', otherwise incremented per byte */
} Cursor;
cannam@226 46
/* Unicode code point type (32 bits wide). */
typedef uint32_t uchar;

/* Reference to a node in the stack (we can not use pointers since the
   stack may be reallocated, invalidating any pointers to elements).
   A Ref is a byte offset from the start of the stack buffer; 0 means
   "no node" (see deref()).
*/
typedef size_t Ref;
cannam@226 53
/* The statement currently being read: stack references to its nodes and a
   pointer to the statement flags.  emit_statement() reads graph, subject,
   predicate and flags from here; a Ref of 0 means the node is absent. */
typedef struct {
    Ref graph;
    Ref subject;
    Ref predicate;
    Ref object;
    Ref datatype;
    Ref lang;
    SerdStatementFlags* flags; /* Reduced to the *_CONT bits after each emit */
} ReadContext;
cannam@226 63
/* Reader state: sinks, input source, node stack and current position. */
struct SerdReaderImpl {
    void* handle;                     /* First argument passed to statement_sink */
    void (*free_handle)(void* ptr);   /* NOTE(review): presumably frees `handle'; not used in this part of the file */
    SerdBaseSink base_sink;
    SerdPrefixSink prefix_sink;
    SerdStatementSink statement_sink; /* May be NULL, see emit_statement() */
    SerdEndSink end_sink;
    SerdErrorSink error_sink;         /* Passed to serd_error() by r_err() */
    void* error_handle;               /* Passed to serd_error() by r_err() */
    Ref rdf_first;                    /* rdf_first/rdf_rest/rdf_nil are never popped by pop_node() */
    Ref rdf_rest;
    Ref rdf_nil;
    SerdNode default_graph;           /* Used for statements without a graph when its buf is non-NULL */
    SerdByteSource source;            /* Input, read via peek_byte()/eat_byte_safe() */
    SerdStack stack;                  /* Storage for nodes; Refs are offsets into stack.buf */
    SerdSyntax syntax;
    unsigned next_id;
    Cursor cur;                       /* Current input position for error messages */
    SerdStatus status;                /* Result of the most recent byte source advance */
    uint8_t* buf;
    uint8_t* bprefix;
    size_t bprefix_len;
    bool strict; ///< True iff strict parsing
    bool eof;
    bool seen_genid;
#ifdef SERD_STACK_CHECK
    Ref* allocs; ///< Stack of push offsets
    size_t n_allocs; ///< Number of stack pushes
#endif
};
cannam@226 94
cannam@226 95 static inline bool
cannam@226 96 supports_fancy_literals(const SerdReader* reader)
cannam@226 97 {
cannam@226 98 return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG;
cannam@226 99 }
cannam@226 100
cannam@226 101 static inline bool
cannam@226 102 supports_relative_iris(const SerdReader* reader)
cannam@226 103 {
cannam@226 104 return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG;
cannam@226 105 }
cannam@226 106
cannam@226 107 static int
cannam@226 108 r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...)
cannam@226 109 {
cannam@226 110 va_list args;
cannam@226 111 va_start(args, fmt);
cannam@226 112 const SerdError e = {
cannam@226 113 st, reader->cur.filename, reader->cur.line, reader->cur.col, fmt, &args
cannam@226 114 };
cannam@226 115 serd_error(reader->error_sink, reader->error_handle, &e);
cannam@226 116 va_end(args);
cannam@226 117 return 0;
cannam@226 118 }
cannam@226 119
/**
   fread-like wrapper for getc (which is faster).

   Reads one byte from the FILE* `stream` into `buf`.  `size` and `nmemb`
   exist only to match the fread signature and are ignored.

   @return 1 if a byte was read; 0 at end of file or on error, in which case
   a zero byte is stored in `buf`.
*/
static size_t
serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream)
{
    uint8_t* const out = (uint8_t*)buf;
    const int      ch  = getc((FILE*)stream);

    if (ch != EOF) {
        *out = (uint8_t)ch;
        return 1;
    }

    *out = 0;
    return 0;
}
cannam@226 132
cannam@226 133 static inline uint8_t
cannam@226 134 peek_byte(SerdReader* reader)
cannam@226 135 {
cannam@226 136 return serd_byte_source_peek(&reader->source);
cannam@226 137 }
cannam@226 138
cannam@226 139 static inline uint8_t
cannam@226 140 eat_byte_safe(SerdReader* reader, const uint8_t byte)
cannam@226 141 {
cannam@226 142 assert(peek_byte(reader) == byte);
cannam@226 143 switch (byte) {
cannam@226 144 case '\0': reader->eof = (byte != '\0'); break;
cannam@226 145 case '\n': ++reader->cur.line; reader->cur.col = 0; break;
cannam@226 146 default: ++reader->cur.col;
cannam@226 147 }
cannam@226 148
cannam@226 149 reader->status = serd_byte_source_advance(&reader->source);
cannam@226 150 return byte;
cannam@226 151 }
cannam@226 152
cannam@226 153 static inline uint8_t
cannam@226 154 eat_byte_check(SerdReader* reader, const uint8_t byte)
cannam@226 155 {
cannam@226 156 const uint8_t c = peek_byte(reader);
cannam@226 157 if (c != byte) {
cannam@226 158 return r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 159 "expected `%c', not `%c'\n", byte, c);
cannam@226 160 }
cannam@226 161 return eat_byte_safe(reader, byte);
cannam@226 162 }
cannam@226 163
cannam@226 164 static inline bool
cannam@226 165 eat_string(SerdReader* reader, const char* str, unsigned n)
cannam@226 166 {
cannam@226 167 bool bad = false;
cannam@226 168 for (unsigned i = 0; i < n; ++i) {
cannam@226 169 bad |= eat_byte_check(reader, ((const uint8_t*)str)[i]);
cannam@226 170 }
cannam@226 171 return bad;
cannam@226 172 }
cannam@226 173
cannam@226 174 static Ref
cannam@226 175 push_node_padded(SerdReader* reader, size_t maxlen,
cannam@226 176 SerdType type, const char* str, size_t n_bytes)
cannam@226 177 {
cannam@226 178 void* mem = serd_stack_push_aligned(
cannam@226 179 &reader->stack, sizeof(SerdNode) + maxlen + 1, sizeof(SerdNode));
cannam@226 180
cannam@226 181 SerdNode* const node = (SerdNode*)mem;
cannam@226 182 node->n_bytes = node->n_chars = n_bytes;
cannam@226 183 node->flags = 0;
cannam@226 184 node->type = type;
cannam@226 185 node->buf = NULL;
cannam@226 186
cannam@226 187 uint8_t* buf = (uint8_t*)(node + 1);
cannam@226 188 memcpy(buf, str, n_bytes + 1);
cannam@226 189
cannam@226 190 #ifdef SERD_STACK_CHECK
cannam@226 191 reader->allocs = realloc(
cannam@226 192 reader->allocs, sizeof(uint8_t*) * (++reader->n_allocs));
cannam@226 193 reader->allocs[reader->n_allocs - 1] = ((uint8_t*)mem - reader->stack.buf);
cannam@226 194 #endif
cannam@226 195 return (uint8_t*)node - reader->stack.buf;
cannam@226 196 }
cannam@226 197
cannam@226 198 static Ref
cannam@226 199 push_node(SerdReader* reader, SerdType type, const char* str, size_t n_bytes)
cannam@226 200 {
cannam@226 201 return push_node_padded(reader, n_bytes, type, str, n_bytes);
cannam@226 202 }
cannam@226 203
cannam@226 204 static inline SerdNode*
cannam@226 205 deref(SerdReader* reader, const Ref ref)
cannam@226 206 {
cannam@226 207 if (ref) {
cannam@226 208 SerdNode* node = (SerdNode*)(reader->stack.buf + ref);
cannam@226 209 node->buf = (uint8_t*)node + sizeof(SerdNode);
cannam@226 210 return node;
cannam@226 211 }
cannam@226 212 return NULL;
cannam@226 213 }
cannam@226 214
cannam@226 215 static inline void
cannam@226 216 push_byte(SerdReader* reader, Ref ref, const uint8_t c)
cannam@226 217 {
cannam@226 218 SERD_STACK_ASSERT_TOP(reader, ref);
cannam@226 219 uint8_t* const s = serd_stack_push(&reader->stack, 1);
cannam@226 220 SerdNode* const node = (SerdNode*)(reader->stack.buf + ref);
cannam@226 221 ++node->n_bytes;
cannam@226 222 if (!(c & 0x80)) { // Starts with 0 bit, start of new character
cannam@226 223 ++node->n_chars;
cannam@226 224 }
cannam@226 225 *(s - 1) = c;
cannam@226 226 *s = '\0';
cannam@226 227 }
cannam@226 228
cannam@226 229 static inline void
cannam@226 230 push_replacement(SerdReader* reader, Ref dest)
cannam@226 231 {
cannam@226 232 push_byte(reader, dest, 0xEF);
cannam@226 233 push_byte(reader, dest, 0xBF);
cannam@226 234 push_byte(reader, dest, 0xBD);
cannam@226 235 }
cannam@226 236
cannam@226 237 static Ref
cannam@226 238 pop_node(SerdReader* reader, Ref ref)
cannam@226 239 {
cannam@226 240 if (ref && ref != reader->rdf_first && ref != reader->rdf_rest
cannam@226 241 && ref != reader->rdf_nil) {
cannam@226 242 #ifdef SERD_STACK_CHECK
cannam@226 243 SERD_STACK_ASSERT_TOP(reader, ref);
cannam@226 244 --reader->n_allocs;
cannam@226 245 #endif
cannam@226 246 SerdNode* const node = deref(reader, ref);
cannam@226 247 uint8_t* const top = reader->stack.buf + reader->stack.size;
cannam@226 248 serd_stack_pop_aligned(&reader->stack, top - (uint8_t*)node);
cannam@226 249 }
cannam@226 250 return 0;
cannam@226 251 }
cannam@226 252
cannam@226 253 static inline bool
cannam@226 254 emit_statement(SerdReader* reader, ReadContext ctx, Ref o, Ref d, Ref l)
cannam@226 255 {
cannam@226 256 SerdNode* graph = deref(reader, ctx.graph);
cannam@226 257 if (!graph && reader->default_graph.buf) {
cannam@226 258 graph = &reader->default_graph;
cannam@226 259 }
cannam@226 260 bool ret = !reader->statement_sink ||
cannam@226 261 !reader->statement_sink(
cannam@226 262 reader->handle, *ctx.flags, graph,
cannam@226 263 deref(reader, ctx.subject), deref(reader, ctx.predicate),
cannam@226 264 deref(reader, o), deref(reader, d), deref(reader, l));
cannam@226 265 *ctx.flags &= SERD_ANON_CONT|SERD_LIST_CONT; // Preserve only cont flags
cannam@226 266 return ret;
cannam@226 267 }
cannam@226 268
cannam@226 269 static bool
cannam@226 270 read_collection(SerdReader* reader, ReadContext ctx, Ref* dest);
cannam@226 271
cannam@226 272 static bool
cannam@226 273 read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot);
cannam@226 274
cannam@226 275 static inline uint8_t
cannam@226 276 read_HEX(SerdReader* reader)
cannam@226 277 {
cannam@226 278 const uint8_t c = peek_byte(reader);
cannam@226 279 if (is_digit(c) || in_range(c, 'A', 'F') || in_range(c, 'a', 'f')) {
cannam@226 280 return eat_byte_safe(reader, c);
cannam@226 281 } else {
cannam@226 282 return r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 283 "invalid hexadecimal digit `%c'\n", c);
cannam@226 284 }
cannam@226 285 }
cannam@226 286
cannam@226 287 // Read UCHAR escape, initial \ is already eaten by caller
cannam@226 288 static inline bool
cannam@226 289 read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code)
cannam@226 290 {
cannam@226 291 const uint8_t b = peek_byte(reader);
cannam@226 292 unsigned length = 0;
cannam@226 293 switch (b) {
cannam@226 294 case 'U':
cannam@226 295 length = 8;
cannam@226 296 break;
cannam@226 297 case 'u':
cannam@226 298 length = 4;
cannam@226 299 break;
cannam@226 300 default:
cannam@226 301 return false;
cannam@226 302 }
cannam@226 303 eat_byte_safe(reader, b);
cannam@226 304
cannam@226 305 uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
cannam@226 306 for (unsigned i = 0; i < length; ++i) {
cannam@226 307 if (!(buf[i] = read_HEX(reader))) {
cannam@226 308 return false;
cannam@226 309 }
cannam@226 310 }
cannam@226 311
cannam@226 312 uint32_t code;
cannam@226 313 sscanf((const char*)buf, "%X", &code);
cannam@226 314
cannam@226 315 unsigned size = 0;
cannam@226 316 if (code < 0x00000080) {
cannam@226 317 size = 1;
cannam@226 318 } else if (code < 0x00000800) {
cannam@226 319 size = 2;
cannam@226 320 } else if (code < 0x00010000) {
cannam@226 321 size = 3;
cannam@226 322 } else if (code < 0x00110000) {
cannam@226 323 size = 4;
cannam@226 324 } else {
cannam@226 325 r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 326 "unicode character 0x%X out of range\n", code);
cannam@226 327 push_replacement(reader, dest);
cannam@226 328 *char_code = 0xFFFD;
cannam@226 329 return true;
cannam@226 330 }
cannam@226 331
cannam@226 332 // Build output in buf
cannam@226 333 // (Note # of bytes = # of leading 1 bits in first byte)
cannam@226 334 uint32_t c = code;
cannam@226 335 switch (size) {
cannam@226 336 case 4:
cannam@226 337 buf[3] = 0x80 | (uint8_t)(c & 0x3F);
cannam@226 338 c >>= 6;
cannam@226 339 c |= (16 << 12); // set bit 4
cannam@226 340 case 3:
cannam@226 341 buf[2] = 0x80 | (uint8_t)(c & 0x3F);
cannam@226 342 c >>= 6;
cannam@226 343 c |= (32 << 6); // set bit 5
cannam@226 344 case 2:
cannam@226 345 buf[1] = 0x80 | (uint8_t)(c & 0x3F);
cannam@226 346 c >>= 6;
cannam@226 347 c |= 0xC0; // set bits 6 and 7
cannam@226 348 case 1:
cannam@226 349 buf[0] = (uint8_t)c;
cannam@226 350 }
cannam@226 351
cannam@226 352 for (unsigned i = 0; i < size; ++i) {
cannam@226 353 push_byte(reader, dest, buf[i]);
cannam@226 354 }
cannam@226 355 *char_code = code;
cannam@226 356 return true;
cannam@226 357 }
cannam@226 358
cannam@226 359 // Read ECHAR escape, initial \ is already eaten by caller
cannam@226 360 static inline bool
cannam@226 361 read_ECHAR(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
cannam@226 362 {
cannam@226 363 const uint8_t c = peek_byte(reader);
cannam@226 364 switch (c) {
cannam@226 365 case 't':
cannam@226 366 eat_byte_safe(reader, 't');
cannam@226 367 push_byte(reader, dest, '\t');
cannam@226 368 return true;
cannam@226 369 case 'b':
cannam@226 370 eat_byte_safe(reader, 'b');
cannam@226 371 push_byte(reader, dest, '\b');
cannam@226 372 return true;
cannam@226 373 case 'n':
cannam@226 374 *flags |= SERD_HAS_NEWLINE;
cannam@226 375 eat_byte_safe(reader, 'n');
cannam@226 376 push_byte(reader, dest, '\n');
cannam@226 377 return true;
cannam@226 378 case 'r':
cannam@226 379 *flags |= SERD_HAS_NEWLINE;
cannam@226 380 eat_byte_safe(reader, 'r');
cannam@226 381 push_byte(reader, dest, '\r');
cannam@226 382 return true;
cannam@226 383 case 'f':
cannam@226 384 eat_byte_safe(reader, 'f');
cannam@226 385 push_byte(reader, dest, '\f');
cannam@226 386 return true;
cannam@226 387 case '\\': case '"': case '\'':
cannam@226 388 push_byte(reader, dest, eat_byte_safe(reader, c));
cannam@226 389 return true;
cannam@226 390 default:
cannam@226 391 return false;
cannam@226 392 }
cannam@226 393 }
cannam@226 394
cannam@226 395 static inline SerdStatus
cannam@226 396 bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c)
cannam@226 397 {
cannam@226 398 r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c);
cannam@226 399 push_replacement(reader, dest);
cannam@226 400
cannam@226 401 // Skip bytes until the next start byte
cannam@226 402 for (uint8_t b = peek_byte(reader); (b & 0x80);) {
cannam@226 403 eat_byte_safe(reader, b);
cannam@226 404 b = peek_byte(reader);
cannam@226 405 }
cannam@226 406
cannam@226 407 return SERD_SUCCESS;
cannam@226 408 }
cannam@226 409
cannam@226 410 static SerdStatus
cannam@226 411 read_utf8_character(SerdReader* reader, Ref dest, uint8_t c)
cannam@226 412 {
cannam@226 413 unsigned size = 1;
cannam@226 414 if ((c & 0xE0) == 0xC0) { // Starts with `110'
cannam@226 415 size = 2;
cannam@226 416 } else if ((c & 0xF0) == 0xE0) { // Starts with `1110'
cannam@226 417 size = 3;
cannam@226 418 } else if ((c & 0xF8) == 0xF0) { // Starts with `11110'
cannam@226 419 size = 4;
cannam@226 420 } else {
cannam@226 421 return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n", c);
cannam@226 422 }
cannam@226 423
cannam@226 424 char bytes[4];
cannam@226 425 bytes[0] = c;
cannam@226 426
cannam@226 427 // Check character validity
cannam@226 428 for (unsigned i = 1; i < size; ++i) {
cannam@226 429 if (((bytes[i] = peek_byte(reader)) & 0x80) == 0) {
cannam@226 430 return bad_char(reader, dest, "invalid UTF-8 continuation 0x%X\n",
cannam@226 431 bytes[i]);
cannam@226 432 }
cannam@226 433 eat_byte_safe(reader, bytes[i]);
cannam@226 434 }
cannam@226 435
cannam@226 436 // Emit character
cannam@226 437 for (unsigned i = 0; i < size; ++i) {
cannam@226 438 push_byte(reader, dest, bytes[i]);
cannam@226 439 }
cannam@226 440 return SERD_SUCCESS;
cannam@226 441 }
cannam@226 442
cannam@226 443 // Read one character (possibly multi-byte)
cannam@226 444 // The first byte, c, has already been eaten by caller
cannam@226 445 static inline SerdStatus
cannam@226 446 read_character(SerdReader* reader, Ref dest, SerdNodeFlags* flags, uint8_t c)
cannam@226 447 {
cannam@226 448 if (!(c & 0x80)) {
cannam@226 449 switch (c) {
cannam@226 450 case 0xA: case 0xD:
cannam@226 451 *flags |= SERD_HAS_NEWLINE;
cannam@226 452 break;
cannam@226 453 case '"': case '\'':
cannam@226 454 *flags |= SERD_HAS_QUOTE;
cannam@226 455 break;
cannam@226 456 }
cannam@226 457 push_byte(reader, dest, c);
cannam@226 458 return SERD_SUCCESS;
cannam@226 459 } else {
cannam@226 460 return read_utf8_character(reader, dest, c);
cannam@226 461 }
cannam@226 462 }
cannam@226 463
cannam@226 464 // [10] comment ::= '#' ( [^#xA #xD] )*
cannam@226 465 static void
cannam@226 466 read_comment(SerdReader* reader)
cannam@226 467 {
cannam@226 468 eat_byte_safe(reader, '#');
cannam@226 469 uint8_t c;
cannam@226 470 while (((c = peek_byte(reader)) != 0xA) && (c != 0xD) && c) {
cannam@226 471 eat_byte_safe(reader, c);
cannam@226 472 }
cannam@226 473 }
cannam@226 474
cannam@226 475 // [24] ws ::= #x9 | #xA | #xD | #x20 | comment
cannam@226 476 static inline bool
cannam@226 477 read_ws(SerdReader* reader)
cannam@226 478 {
cannam@226 479 const uint8_t c = peek_byte(reader);
cannam@226 480 switch (c) {
cannam@226 481 case 0x9: case 0xA: case 0xD: case 0x20:
cannam@226 482 eat_byte_safe(reader, c);
cannam@226 483 return true;
cannam@226 484 case '#':
cannam@226 485 read_comment(reader);
cannam@226 486 return true;
cannam@226 487 default:
cannam@226 488 return false;
cannam@226 489 }
cannam@226 490 }
cannam@226 491
cannam@226 492 static inline bool
cannam@226 493 read_ws_star(SerdReader* reader)
cannam@226 494 {
cannam@226 495 while (read_ws(reader)) {}
cannam@226 496 return true;
cannam@226 497 }
cannam@226 498
cannam@226 499 static inline bool
cannam@226 500 peek_delim(SerdReader* reader, const char delim)
cannam@226 501 {
cannam@226 502 read_ws_star(reader);
cannam@226 503 return peek_byte(reader) == delim;
cannam@226 504 }
cannam@226 505
cannam@226 506 static inline bool
cannam@226 507 eat_delim(SerdReader* reader, const char delim)
cannam@226 508 {
cannam@226 509 if (peek_delim(reader, delim)) {
cannam@226 510 eat_byte_safe(reader, delim);
cannam@226 511 return read_ws_star(reader);
cannam@226 512 }
cannam@226 513 return false;
cannam@226 514 }
cannam@226 515
cannam@226 516 // STRING_LITERAL_LONG_QUOTE and STRING_LITERAL_LONG_SINGLE_QUOTE
cannam@226 517 // Initial triple quotes are already eaten by caller
cannam@226 518 static Ref
cannam@226 519 read_STRING_LITERAL_LONG(SerdReader* reader, SerdNodeFlags* flags, uint8_t q)
cannam@226 520 {
cannam@226 521 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
cannam@226 522 while (true) {
cannam@226 523 const uint8_t c = peek_byte(reader);
cannam@226 524 uint32_t code;
cannam@226 525 switch (c) {
cannam@226 526 case '\\':
cannam@226 527 eat_byte_safe(reader, c);
cannam@226 528 if (!read_ECHAR(reader, ref, flags) &&
cannam@226 529 !read_UCHAR(reader, ref, &code)) {
cannam@226 530 r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 531 "invalid escape `\\%c'\n", peek_byte(reader));
cannam@226 532 return pop_node(reader, ref);
cannam@226 533 }
cannam@226 534 break;
cannam@226 535 default:
cannam@226 536 if (c == q) {
cannam@226 537 eat_byte_safe(reader, q);
cannam@226 538 const uint8_t q2 = eat_byte_safe(reader, peek_byte(reader));
cannam@226 539 const uint8_t q3 = peek_byte(reader);
cannam@226 540 if (q2 == q && q3 == q) { // End of string
cannam@226 541 eat_byte_safe(reader, q3);
cannam@226 542 return ref;
cannam@226 543 } else {
cannam@226 544 *flags |= SERD_HAS_QUOTE;
cannam@226 545 push_byte(reader, ref, c);
cannam@226 546 read_character(reader, ref, flags, q2);
cannam@226 547 }
cannam@226 548 } else {
cannam@226 549 read_character(reader, ref, flags, eat_byte_safe(reader, c));
cannam@226 550 }
cannam@226 551 }
cannam@226 552 }
cannam@226 553 return ref;
cannam@226 554 }
cannam@226 555
cannam@226 556 // STRING_LITERAL_QUOTE and STRING_LITERAL_SINGLE_QUOTE
cannam@226 557 // Initial quote is already eaten by caller
cannam@226 558 static Ref
cannam@226 559 read_STRING_LITERAL(SerdReader* reader, SerdNodeFlags* flags, uint8_t q)
cannam@226 560 {
cannam@226 561 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
cannam@226 562 while (true) {
cannam@226 563 const uint8_t c = peek_byte(reader);
cannam@226 564 uint32_t code;
cannam@226 565 switch (c) {
cannam@226 566 case '\n': case '\r':
cannam@226 567 r_err(reader, SERD_ERR_BAD_SYNTAX, "line end in short string\n");
cannam@226 568 return pop_node(reader, ref);
cannam@226 569 case '\\':
cannam@226 570 eat_byte_safe(reader, c);
cannam@226 571 if (!read_ECHAR(reader, ref, flags) &&
cannam@226 572 !read_UCHAR(reader, ref, &code)) {
cannam@226 573 r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 574 "invalid escape `\\%c'\n", peek_byte(reader));
cannam@226 575 return pop_node(reader, ref);
cannam@226 576 }
cannam@226 577 break;
cannam@226 578 default:
cannam@226 579 if (c == q) {
cannam@226 580 eat_byte_check(reader, q);
cannam@226 581 return ref;
cannam@226 582 } else {
cannam@226 583 read_character(reader, ref, flags, eat_byte_safe(reader, c));
cannam@226 584 }
cannam@226 585 }
cannam@226 586 }
cannam@226 587 eat_byte_check(reader, q);
cannam@226 588 return ref;
cannam@226 589 }
cannam@226 590
cannam@226 591 static Ref
cannam@226 592 read_String(SerdReader* reader, SerdNodeFlags* flags)
cannam@226 593 {
cannam@226 594 const uint8_t q1 = peek_byte(reader);
cannam@226 595 eat_byte_safe(reader, q1);
cannam@226 596
cannam@226 597 const uint8_t q2 = peek_byte(reader);
cannam@226 598 if (q2 != q1) { // Short string (not triple quoted)
cannam@226 599 return read_STRING_LITERAL(reader, flags, q1);
cannam@226 600 }
cannam@226 601
cannam@226 602 eat_byte_safe(reader, q2);
cannam@226 603 const uint8_t q3 = peek_byte(reader);
cannam@226 604 if (q3 != q1) { // Empty short string ("" or '')
cannam@226 605 return push_node(reader, SERD_LITERAL, "", 0);
cannam@226 606 }
cannam@226 607
cannam@226 608 if (!supports_fancy_literals(reader)) {
cannam@226 609 return r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 610 "syntax does not support long literals\n");
cannam@226 611 }
cannam@226 612
cannam@226 613 eat_byte_safe(reader, q3);
cannam@226 614 return read_STRING_LITERAL_LONG(reader, flags, q1);
cannam@226 615 }
cannam@226 616
cannam@226 617 static bool
cannam@226 618 read_PN_CHARS_BASE(SerdReader* reader, Ref dest)
cannam@226 619 {
cannam@226 620 const uint8_t c = peek_byte(reader);
cannam@226 621 if ((c & 0x80)) { // Multi-byte character
cannam@226 622 return !read_utf8_character(reader, dest, eat_byte_safe(reader, c));
cannam@226 623 }
cannam@226 624 if (is_alpha(c)) {
cannam@226 625 push_byte(reader, dest, eat_byte_safe(reader, c));
cannam@226 626 return true;
cannam@226 627 }
cannam@226 628 return false;
cannam@226 629 }
cannam@226 630
cannam@226 631 static bool
cannam@226 632 read_PN_CHARS(SerdReader* reader, Ref dest)
cannam@226 633 {
cannam@226 634 const uint8_t c = peek_byte(reader);
cannam@226 635 if ((c & 0x80)) { // Multi-byte character
cannam@226 636 return !read_utf8_character(reader, dest, eat_byte_safe(reader, c));
cannam@226 637 }
cannam@226 638
cannam@226 639 if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') {
cannam@226 640 push_byte(reader, dest, eat_byte_safe(reader, c));
cannam@226 641 return true;
cannam@226 642 }
cannam@226 643 return false;
cannam@226 644 }
cannam@226 645
cannam@226 646 static bool
cannam@226 647 read_PERCENT(SerdReader* reader, Ref dest)
cannam@226 648 {
cannam@226 649 push_byte(reader, dest, eat_byte_safe(reader, '%'));
cannam@226 650 const uint8_t h1 = read_HEX(reader);
cannam@226 651 const uint8_t h2 = read_HEX(reader);
cannam@226 652 if (h1 && h2) {
cannam@226 653 push_byte(reader, dest, h1);
cannam@226 654 push_byte(reader, dest, h2);
cannam@226 655 return true;
cannam@226 656 }
cannam@226 657 return false;
cannam@226 658 }
cannam@226 659
cannam@226 660 static SerdStatus
cannam@226 661 read_PLX(SerdReader* reader, Ref dest)
cannam@226 662 {
cannam@226 663 uint8_t c = peek_byte(reader);
cannam@226 664 switch (c) {
cannam@226 665 case '%':
cannam@226 666 if (!read_PERCENT(reader, dest)) {
cannam@226 667 return SERD_ERR_BAD_SYNTAX;
cannam@226 668 }
cannam@226 669 return SERD_SUCCESS;
cannam@226 670 case '\\':
cannam@226 671 eat_byte_safe(reader, c);
cannam@226 672 if (is_alpha(c = peek_byte(reader))) {
cannam@226 673 // Escapes like \u \n etc. are not supported
cannam@226 674 return SERD_ERR_BAD_SYNTAX;
cannam@226 675 } else {
cannam@226 676 // Allow escaping of pretty much any other character
cannam@226 677 push_byte(reader, dest, eat_byte_safe(reader, c));
cannam@226 678 return SERD_SUCCESS;
cannam@226 679 }
cannam@226 680 default:
cannam@226 681 return SERD_FAILURE;
cannam@226 682 }
cannam@226 683 }
cannam@226 684
cannam@226 685 static SerdStatus
cannam@226 686 read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot)
cannam@226 687 {
cannam@226 688 uint8_t c = peek_byte(reader);
cannam@226 689 SerdStatus st;
cannam@226 690 switch (c) {
cannam@226 691 case '0': case '1': case '2': case '3': case '4': case '5':
cannam@226 692 case '6': case '7': case '8': case '9': case ':': case '_':
cannam@226 693 push_byte(reader, dest, eat_byte_safe(reader, c));
cannam@226 694 break;
cannam@226 695 default:
cannam@226 696 if ((st = read_PLX(reader, dest)) > SERD_FAILURE) {
cannam@226 697 return st;
cannam@226 698 } else if (st != SERD_SUCCESS && !read_PN_CHARS_BASE(reader, dest)) {
cannam@226 699 return SERD_FAILURE;
cannam@226 700 }
cannam@226 701 }
cannam@226 702
cannam@226 703 while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.' | ';')*
cannam@226 704 if (c == '.' || c == ':') {
cannam@226 705 push_byte(reader, dest, eat_byte_safe(reader, c));
cannam@226 706 } else if ((st = read_PLX(reader, dest)) > SERD_FAILURE) {
cannam@226 707 return st;
cannam@226 708 } else if (st != SERD_SUCCESS && !read_PN_CHARS(reader, dest)) {
cannam@226 709 break;
cannam@226 710 }
cannam@226 711 }
cannam@226 712
cannam@226 713 SerdNode* const n = deref(reader, dest);
cannam@226 714 if (n->buf[n->n_bytes - 1] == '.') {
cannam@226 715 // Ate trailing dot, pop it from stack/node and inform caller
cannam@226 716 --n->n_bytes;
cannam@226 717 serd_stack_pop(&reader->stack, 1);
cannam@226 718 *ate_dot = true;
cannam@226 719 }
cannam@226 720
cannam@226 721 return SERD_SUCCESS;
cannam@226 722 }
cannam@226 723
cannam@226 724 // Read the remainder of a PN_PREFIX after some initial characters
cannam@226 725 static SerdStatus
cannam@226 726 read_PN_PREFIX_tail(SerdReader* reader, Ref dest)
cannam@226 727 {
cannam@226 728 uint8_t c;
cannam@226 729 while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')*
cannam@226 730 if (c == '.') {
cannam@226 731 push_byte(reader, dest, eat_byte_safe(reader, c));
cannam@226 732 } else if (!read_PN_CHARS(reader, dest)) {
cannam@226 733 break;
cannam@226 734 }
cannam@226 735 }
cannam@226 736
cannam@226 737 const SerdNode* const n = deref(reader, dest);
cannam@226 738 if (n->buf[n->n_bytes - 1] == '.' && !read_PN_CHARS(reader, dest)) {
cannam@226 739 r_err(reader, SERD_ERR_BAD_SYNTAX, "prefix ends with `.'\n");
cannam@226 740 return SERD_ERR_BAD_SYNTAX;
cannam@226 741 }
cannam@226 742
cannam@226 743 return SERD_SUCCESS;
cannam@226 744 }
cannam@226 745
cannam@226 746 static SerdStatus
cannam@226 747 read_PN_PREFIX(SerdReader* reader, Ref dest)
cannam@226 748 {
cannam@226 749 if (read_PN_CHARS_BASE(reader, dest)) {
cannam@226 750 return read_PN_PREFIX_tail(reader, dest);
cannam@226 751 }
cannam@226 752 return SERD_FAILURE;
cannam@226 753 }
cannam@226 754
cannam@226 755 static Ref
cannam@226 756 read_LANGTAG(SerdReader* reader)
cannam@226 757 {
cannam@226 758 uint8_t c = peek_byte(reader);
cannam@226 759 if (!is_alpha(c)) {
cannam@226 760 return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'\n", c);
cannam@226 761 }
cannam@226 762 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
cannam@226 763 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 764 while ((c = peek_byte(reader)) && is_alpha(c)) {
cannam@226 765 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 766 }
cannam@226 767 while (peek_byte(reader) == '-') {
cannam@226 768 push_byte(reader, ref, eat_byte_safe(reader, '-'));
cannam@226 769 while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) {
cannam@226 770 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 771 }
cannam@226 772 }
cannam@226 773 return ref;
cannam@226 774 }
cannam@226 775
cannam@226 776 typedef enum { PREFIX, GOOD, BAD} SchemeState;
cannam@226 777
cannam@226 778 static inline bool
cannam@226 779 check_scheme(SerdReader* reader, uint8_t c, SchemeState* state)
cannam@226 780 {
cannam@226 781 if (!supports_relative_iris(reader) && *state == PREFIX) {
cannam@226 782 if (c == ':') {
cannam@226 783 *state = GOOD;
cannam@226 784 } else if (!isalpha(c)) {
cannam@226 785 *state = BAD;
cannam@226 786 return r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 787 "syntax does not support relative IRIs\n");
cannam@226 788 }
cannam@226 789 }
cannam@226 790 return true;
cannam@226 791 }
cannam@226 792
cannam@226 793 static Ref
cannam@226 794 read_IRIREF(SerdReader* reader)
cannam@226 795 {
cannam@226 796 TRY_RET(eat_byte_check(reader, '<'));
cannam@226 797 Ref ref = push_node(reader, SERD_URI, "", 0);
cannam@226 798 SchemeState scheme = PREFIX;
cannam@226 799 uint32_t code;
cannam@226 800 while (true) {
cannam@226 801 const uint8_t c = peek_byte(reader);
cannam@226 802 if (!check_scheme(reader, c, &scheme)) {
cannam@226 803 return pop_node(reader, ref);
cannam@226 804 }
cannam@226 805 switch (c) {
cannam@226 806 case '"': case '<': case '^': case '`': case '{': case '|': case '}':
cannam@226 807 r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 808 "invalid IRI character `%c'\n", c);
cannam@226 809 return pop_node(reader, ref);
cannam@226 810 case '>':
cannam@226 811 eat_byte_safe(reader, c);
cannam@226 812 return ref;
cannam@226 813 case '\\':
cannam@226 814 eat_byte_safe(reader, c);
cannam@226 815 if (!read_UCHAR(reader, ref, &code)) {
cannam@226 816 r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape\n");
cannam@226 817 return pop_node(reader, ref);
cannam@226 818 }
cannam@226 819 switch (code) {
cannam@226 820 case 0: case ' ': case '<': case '>':
cannam@226 821 r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 822 "invalid escaped IRI character %X %c\n", code, code);
cannam@226 823 return pop_node(reader, ref);
cannam@226 824 }
cannam@226 825 break;
cannam@226 826 default:
cannam@226 827 if (c <= 0x20) {
cannam@226 828 if (isprint(c)) {
cannam@226 829 r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 830 "invalid IRI character `%c' (escape %%%02X)\n", c, c);
cannam@226 831 } else {
cannam@226 832 r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 833 "invalid IRI character (escape %%%02X)\n", c, c);
cannam@226 834 }
cannam@226 835 if (reader->strict) {
cannam@226 836 return pop_node(reader, ref);
cannam@226 837 }
cannam@226 838 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 839 } else {
cannam@226 840 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 841 }
cannam@226 842 }
cannam@226 843 }
cannam@226 844 }
cannam@226 845
cannam@226 846 static bool
cannam@226 847 read_PrefixedName(SerdReader* reader, Ref dest, bool read_prefix, bool* ate_dot)
cannam@226 848 {
cannam@226 849 if (read_prefix && read_PN_PREFIX(reader, dest) > SERD_FAILURE) {
cannam@226 850 return false;
cannam@226 851 } else if (peek_byte(reader) != ':') {
cannam@226 852 return false;
cannam@226 853 }
cannam@226 854
cannam@226 855 push_byte(reader, dest, eat_byte_safe(reader, ':'));
cannam@226 856 return read_PN_LOCAL(reader, dest, ate_dot) <= SERD_FAILURE;
cannam@226 857 }
cannam@226 858
cannam@226 859 static bool
cannam@226 860 read_0_9(SerdReader* reader, Ref str, bool at_least_one)
cannam@226 861 {
cannam@226 862 unsigned count = 0;
cannam@226 863 for (uint8_t c; is_digit((c = peek_byte(reader))); ++count) {
cannam@226 864 push_byte(reader, str, eat_byte_safe(reader, c));
cannam@226 865 }
cannam@226 866 if (at_least_one && count == 0) {
cannam@226 867 r_err(reader, SERD_ERR_BAD_SYNTAX, "expected digit\n");
cannam@226 868 }
cannam@226 869 return count;
cannam@226 870 }
cannam@226 871
cannam@226 872 static bool
cannam@226 873 read_number(SerdReader* reader, Ref* dest, Ref* datatype, bool* ate_dot)
cannam@226 874 {
cannam@226 875 #define XSD_DECIMAL NS_XSD "decimal"
cannam@226 876 #define XSD_DOUBLE NS_XSD "double"
cannam@226 877 #define XSD_INTEGER NS_XSD "integer"
cannam@226 878 Ref ref = push_node(reader, SERD_LITERAL, "", 0);
cannam@226 879 uint8_t c = peek_byte(reader);
cannam@226 880 bool has_decimal = false;
cannam@226 881 if (c == '-' || c == '+') {
cannam@226 882 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 883 }
cannam@226 884 if ((c = peek_byte(reader)) == '.') {
cannam@226 885 has_decimal = true;
cannam@226 886 // decimal case 2 (e.g. '.0' or `-.0' or `+.0')
cannam@226 887 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 888 TRY_THROW(read_0_9(reader, ref, true));
cannam@226 889 } else {
cannam@226 890 // all other cases ::= ( '-' | '+' ) [0-9]+ ( . )? ( [0-9]+ )? ...
cannam@226 891 TRY_THROW(is_digit(c));
cannam@226 892 read_0_9(reader, ref, true);
cannam@226 893 if ((c = peek_byte(reader)) == '.') {
cannam@226 894 has_decimal = true;
cannam@226 895
cannam@226 896 // Annoyingly, dot can be end of statement, so tentatively eat
cannam@226 897 eat_byte_safe(reader, c);
cannam@226 898 c = peek_byte(reader);
cannam@226 899 if (!is_digit(c) && c != 'e' && c != 'E') {
cannam@226 900 *dest = ref;
cannam@226 901 *ate_dot = true; // Force caller to deal with stupid grammar
cannam@226 902 return true; // Next byte is not a number character, done
cannam@226 903 }
cannam@226 904
cannam@226 905 push_byte(reader, ref, '.');
cannam@226 906 read_0_9(reader, ref, false);
cannam@226 907 }
cannam@226 908 }
cannam@226 909 c = peek_byte(reader);
cannam@226 910 if (c == 'e' || c == 'E') {
cannam@226 911 // double
cannam@226 912 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 913 switch ((c = peek_byte(reader))) {
cannam@226 914 case '+': case '-':
cannam@226 915 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 916 default: break;
cannam@226 917 }
cannam@226 918 TRY_THROW(read_0_9(reader, ref, true));
cannam@226 919 *datatype = push_node(reader, SERD_URI,
cannam@226 920 XSD_DOUBLE, sizeof(XSD_DOUBLE) - 1);
cannam@226 921 } else if (has_decimal) {
cannam@226 922 *datatype = push_node(reader, SERD_URI,
cannam@226 923 XSD_DECIMAL, sizeof(XSD_DECIMAL) - 1);
cannam@226 924 } else {
cannam@226 925 *datatype = push_node(reader, SERD_URI,
cannam@226 926 XSD_INTEGER, sizeof(XSD_INTEGER) - 1);
cannam@226 927 }
cannam@226 928 *dest = ref;
cannam@226 929 return true;
cannam@226 930 except:
cannam@226 931 pop_node(reader, *datatype);
cannam@226 932 pop_node(reader, ref);
cannam@226 933 return false;
cannam@226 934 }
cannam@226 935
cannam@226 936 static bool
cannam@226 937 read_iri(SerdReader* reader, Ref* dest, bool* ate_dot)
cannam@226 938 {
cannam@226 939 switch (peek_byte(reader)) {
cannam@226 940 case '<':
cannam@226 941 *dest = read_IRIREF(reader);
cannam@226 942 return true;
cannam@226 943 default:
cannam@226 944 *dest = push_node(reader, SERD_CURIE, "", 0);
cannam@226 945 return read_PrefixedName(reader, *dest, true, ate_dot);
cannam@226 946 }
cannam@226 947 }
cannam@226 948
cannam@226 949 static bool
cannam@226 950 read_literal(SerdReader* reader, Ref* dest,
cannam@226 951 Ref* datatype, Ref* lang, SerdNodeFlags* flags, bool* ate_dot)
cannam@226 952 {
cannam@226 953 Ref str = read_String(reader, flags);
cannam@226 954 if (!str) {
cannam@226 955 return false;
cannam@226 956 }
cannam@226 957
cannam@226 958 switch (peek_byte(reader)) {
cannam@226 959 case '@':
cannam@226 960 eat_byte_safe(reader, '@');
cannam@226 961 TRY_THROW(*lang = read_LANGTAG(reader));
cannam@226 962 break;
cannam@226 963 case '^':
cannam@226 964 eat_byte_safe(reader, '^');
cannam@226 965 eat_byte_check(reader, '^');
cannam@226 966 TRY_THROW(read_iri(reader, datatype, ate_dot));
cannam@226 967 break;
cannam@226 968 }
cannam@226 969 *dest = str;
cannam@226 970 return true;
cannam@226 971 except:
cannam@226 972 *datatype = pop_node(reader, *datatype);
cannam@226 973 *lang = pop_node(reader, *lang);
cannam@226 974 pop_node(reader, str);
cannam@226 975 return false;
cannam@226 976 }
cannam@226 977
/**
   Return true if `c` ends a token: NUL, tab, line feed, carriage return,
   space, '#', '.', ';', or '<'.
*/
inline static bool
is_token_end(uint8_t c)
{
	const bool is_blank = (c == '\0' || c == 0x9 || c == 0xA ||
	                       c == 0xD || c == 0x20);
	const bool is_punct = (c == '#' || c == '.' || c == ';' || c == '<');

	return is_blank || is_punct;
}
cannam@226 989
cannam@226 990 static bool
cannam@226 991 read_verb(SerdReader* reader, Ref* dest)
cannam@226 992 {
cannam@226 993 if (peek_byte(reader) == '<') {
cannam@226 994 return (*dest = read_IRIREF(reader));
cannam@226 995 } else {
cannam@226 996 /* Either a qname, or "a". Read the prefix first, and if it is in fact
cannam@226 997 "a", produce that instead.
cannam@226 998 */
cannam@226 999 *dest = push_node(reader, SERD_CURIE, "", 0);
cannam@226 1000 SerdNode* node = deref(reader, *dest);
cannam@226 1001 const SerdStatus st = read_PN_PREFIX(reader, *dest);
cannam@226 1002 bool ate_dot = false;
cannam@226 1003 if (!st && node->n_bytes == 1 && node->buf[0] == 'a' &&
cannam@226 1004 is_token_end(peek_byte(reader))) {
cannam@226 1005 pop_node(reader, *dest);
cannam@226 1006 return (*dest = push_node(reader, SERD_URI, NS_RDF "type", 47));
cannam@226 1007 } else if (st > SERD_FAILURE ||
cannam@226 1008 !read_PrefixedName(reader, *dest, false, &ate_dot) ||
cannam@226 1009 ate_dot) {
cannam@226 1010 return (*dest = pop_node(reader, *dest));
cannam@226 1011 } else {
cannam@226 1012 return true;
cannam@226 1013 }
cannam@226 1014 }
cannam@226 1015 return false;
cannam@226 1016 }
cannam@226 1017
cannam@226 1018 static Ref
cannam@226 1019 read_BLANK_NODE_LABEL(SerdReader* reader, bool* ate_dot)
cannam@226 1020 {
cannam@226 1021 eat_byte_safe(reader, '_');
cannam@226 1022 eat_byte_check(reader, ':');
cannam@226 1023 Ref ref = push_node(reader, SERD_BLANK,
cannam@226 1024 reader->bprefix ? (char*)reader->bprefix : "",
cannam@226 1025 reader->bprefix_len);
cannam@226 1026
cannam@226 1027 uint8_t c = peek_byte(reader); // First: (PN_CHARS | '_' | [0-9])
cannam@226 1028 if (is_digit(c) || c == '_') {
cannam@226 1029 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 1030 } else if (!read_PN_CHARS(reader, ref)) {
cannam@226 1031 r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid name start character\n");
cannam@226 1032 return pop_node(reader, ref);
cannam@226 1033 }
cannam@226 1034
cannam@226 1035 while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')*
cannam@226 1036 if (c == '.') {
cannam@226 1037 push_byte(reader, ref, eat_byte_safe(reader, c));
cannam@226 1038 } else if (!read_PN_CHARS(reader, ref)) {
cannam@226 1039 break;
cannam@226 1040 }
cannam@226 1041 }
cannam@226 1042
cannam@226 1043 SerdNode* n = deref(reader, ref);
cannam@226 1044 if (n->buf[n->n_bytes - 1] == '.' && !read_PN_CHARS(reader, ref)) {
cannam@226 1045 // Ate trailing dot, pop it from stack/node and inform caller
cannam@226 1046 --n->n_bytes;
cannam@226 1047 serd_stack_pop(&reader->stack, 1);
cannam@226 1048 *ate_dot = true;
cannam@226 1049 }
cannam@226 1050
cannam@226 1051 if (reader->syntax == SERD_TURTLE) {
cannam@226 1052 if (is_digit(n->buf[reader->bprefix_len + 1])) {
cannam@226 1053 if ((n->buf[reader->bprefix_len]) == 'b') {
cannam@226 1054 ((char*)n->buf)[reader->bprefix_len] = 'B'; // Prevent clash
cannam@226 1055 reader->seen_genid = true;
cannam@226 1056 } else if (reader->seen_genid &&
cannam@226 1057 n->buf[reader->bprefix_len] == 'B') {
cannam@226 1058 r_err(reader, SERD_ERR_ID_CLASH,
cannam@226 1059 "found both `b' and `B' blank IDs, prefix required\n");
cannam@226 1060 return pop_node(reader, ref);
cannam@226 1061 }
cannam@226 1062 }
cannam@226 1063 }
cannam@226 1064 return ref;
cannam@226 1065 }
cannam@226 1066
cannam@226 1067 static void
cannam@226 1068 set_blank_id(SerdReader* reader, Ref ref, size_t buf_size)
cannam@226 1069 {
cannam@226 1070 SerdNode* node = deref(reader, ref);
cannam@226 1071 const char* prefix = reader->bprefix ? (const char*)reader->bprefix : "";
cannam@226 1072 node->n_bytes = node->n_chars = snprintf(
cannam@226 1073 (char*)node->buf, buf_size, "%sb%u", prefix, reader->next_id++);
cannam@226 1074 }
cannam@226 1075
cannam@226 1076 static size_t
cannam@226 1077 genid_size(SerdReader* reader)
cannam@226 1078 {
cannam@226 1079 return reader->bprefix_len + 1 + 10 + 1; // + "b" + UINT32_MAX + \0
cannam@226 1080 }
cannam@226 1081
cannam@226 1082 static Ref
cannam@226 1083 blank_id(SerdReader* reader)
cannam@226 1084 {
cannam@226 1085 Ref ref = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0);
cannam@226 1086 set_blank_id(reader, ref, genid_size(reader));
cannam@226 1087 return ref;
cannam@226 1088 }
cannam@226 1089
cannam@226 1090 static Ref
cannam@226 1091 read_blankName(SerdReader* reader)
cannam@226 1092 {
cannam@226 1093 eat_byte_safe(reader, '=');
cannam@226 1094 if (eat_byte_check(reader, '=') != '=') {
cannam@226 1095 return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected `='\n");
cannam@226 1096 }
cannam@226 1097
cannam@226 1098 Ref subject = 0;
cannam@226 1099 bool ate_dot = false;
cannam@226 1100 read_ws_star(reader);
cannam@226 1101 read_iri(reader, &subject, &ate_dot);
cannam@226 1102 return subject;
cannam@226 1103 }
cannam@226 1104
cannam@226 1105 static bool
cannam@226 1106 read_anon(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest)
cannam@226 1107 {
cannam@226 1108 const SerdStatementFlags old_flags = *ctx.flags;
cannam@226 1109 bool empty;
cannam@226 1110 eat_byte_safe(reader, '[');
cannam@226 1111 if ((empty = peek_delim(reader, ']'))) {
cannam@226 1112 *ctx.flags |= (subject) ? SERD_EMPTY_S : SERD_EMPTY_O;
cannam@226 1113 } else {
cannam@226 1114 *ctx.flags |= (subject) ? SERD_ANON_S_BEGIN : SERD_ANON_O_BEGIN;
cannam@226 1115 if (peek_delim(reader, '=')) {
cannam@226 1116 if (!(*dest = read_blankName(reader)) ||
cannam@226 1117 !eat_delim(reader, ';')) {
cannam@226 1118 return false;
cannam@226 1119 }
cannam@226 1120 }
cannam@226 1121 }
cannam@226 1122
cannam@226 1123 if (!*dest) {
cannam@226 1124 *dest = blank_id(reader);
cannam@226 1125 }
cannam@226 1126 if (ctx.subject) {
cannam@226 1127 TRY_RET(emit_statement(reader, ctx, *dest, 0, 0));
cannam@226 1128 }
cannam@226 1129
cannam@226 1130 ctx.subject = *dest;
cannam@226 1131 if (!empty) {
cannam@226 1132 *ctx.flags &= ~(SERD_LIST_CONT);
cannam@226 1133 if (!subject) {
cannam@226 1134 *ctx.flags |= SERD_ANON_CONT;
cannam@226 1135 }
cannam@226 1136 bool ate_dot_in_list = false;
cannam@226 1137 read_predicateObjectList(reader, ctx, &ate_dot_in_list);
cannam@226 1138 if (ate_dot_in_list) {
cannam@226 1139 return r_err(reader, SERD_ERR_BAD_SYNTAX, "`.' inside blank\n");
cannam@226 1140 }
cannam@226 1141 read_ws_star(reader);
cannam@226 1142 if (reader->end_sink) {
cannam@226 1143 reader->end_sink(reader->handle, deref(reader, *dest));
cannam@226 1144 }
cannam@226 1145 *ctx.flags = old_flags;
cannam@226 1146 }
cannam@226 1147 return (eat_byte_check(reader, ']') == ']');
cannam@226 1148 }
cannam@226 1149
cannam@226 1150 /* If emit is true: recurses, calling statement_sink for every statement
cannam@226 1151 encountered, and leaves stack in original calling state (i.e. pops
cannam@226 1152 everything it pushes). */
cannam@226 1153 static bool
cannam@226 1154 read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot)
cannam@226 1155 {
cannam@226 1156 static const char* const XSD_BOOLEAN = NS_XSD "boolean";
cannam@226 1157 static const size_t XSD_BOOLEAN_LEN = 40;
cannam@226 1158
cannam@226 1159 #ifndef NDEBUG
cannam@226 1160 const size_t orig_stack_size = reader->stack.size;
cannam@226 1161 #endif
cannam@226 1162
cannam@226 1163 bool ret = false;
cannam@226 1164 bool simple = (ctx->subject != 0);
cannam@226 1165 SerdNode* node = NULL;
cannam@226 1166 Ref o = 0;
cannam@226 1167 Ref datatype = 0;
cannam@226 1168 Ref lang = 0;
cannam@226 1169 uint32_t flags = 0;
cannam@226 1170 const uint8_t c = peek_byte(reader);
cannam@226 1171 if (!supports_fancy_literals(reader)) {
cannam@226 1172 switch (c) {
cannam@226 1173 case '"': case ':': case '<': case '_': break;
cannam@226 1174 default: return r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 1175 "expected: ':', '<', or '_'\n");
cannam@226 1176 }
cannam@226 1177 }
cannam@226 1178 switch (c) {
cannam@226 1179 case '\0':
cannam@226 1180 case ')':
cannam@226 1181 return false;
cannam@226 1182 case '[':
cannam@226 1183 simple = false;
cannam@226 1184 TRY_THROW(ret = read_anon(reader, *ctx, false, &o));
cannam@226 1185 break;
cannam@226 1186 case '(':
cannam@226 1187 simple = false;
cannam@226 1188 TRY_THROW(ret = read_collection(reader, *ctx, &o));
cannam@226 1189 break;
cannam@226 1190 case '_':
cannam@226 1191 TRY_THROW(ret = (o = read_BLANK_NODE_LABEL(reader, ate_dot)));
cannam@226 1192 break;
cannam@226 1193 case '<': case ':':
cannam@226 1194 TRY_THROW(ret = read_iri(reader, &o, ate_dot));
cannam@226 1195 break;
cannam@226 1196 case '+': case '-': case '.': case '0': case '1': case '2': case '3':
cannam@226 1197 case '4': case '5': case '6': case '7': case '8': case '9':
cannam@226 1198 TRY_THROW(ret = read_number(reader, &o, &datatype, ate_dot));
cannam@226 1199 break;
cannam@226 1200 case '\"':
cannam@226 1201 case '\'':
cannam@226 1202 TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang, &flags, ate_dot));
cannam@226 1203 break;
cannam@226 1204 default:
cannam@226 1205 /* Either a boolean literal, or a qname. Read the prefix first, and if
cannam@226 1206 it is in fact a "true" or "false" literal, produce that instead.
cannam@226 1207 */
cannam@226 1208 node = deref(reader, o = push_node(reader, SERD_CURIE, "", 0));
cannam@226 1209 while (read_PN_CHARS_BASE(reader, o)) {}
cannam@226 1210 if ((node->n_bytes == 4 && !memcmp(node->buf, "true", 4)) ||
cannam@226 1211 (node->n_bytes == 5 && !memcmp(node->buf, "false", 5))) {
cannam@226 1212 node->type = SERD_LITERAL;
cannam@226 1213 datatype = push_node(
cannam@226 1214 reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN);
cannam@226 1215 ret = true;
cannam@226 1216 } else if (read_PN_PREFIX_tail(reader, o) > SERD_FAILURE) {
cannam@226 1217 ret = false;
cannam@226 1218 } else {
cannam@226 1219 ret = read_PrefixedName(reader, o, false, ate_dot);
cannam@226 1220 }
cannam@226 1221 }
cannam@226 1222
cannam@226 1223 if (simple && o) {
cannam@226 1224 deref(reader, o)->flags = flags;
cannam@226 1225 }
cannam@226 1226
cannam@226 1227 if (ret && emit && simple) {
cannam@226 1228 ret = emit_statement(reader, *ctx, o, datatype, lang);
cannam@226 1229 } else if (ret && !emit) {
cannam@226 1230 ctx->object = o;
cannam@226 1231 ctx->datatype = datatype;
cannam@226 1232 ctx->lang = lang;
cannam@226 1233 return true;
cannam@226 1234 }
cannam@226 1235
cannam@226 1236 except:
cannam@226 1237 pop_node(reader, lang);
cannam@226 1238 pop_node(reader, datatype);
cannam@226 1239 pop_node(reader, o);
cannam@226 1240 #ifndef NDEBUG
cannam@226 1241 assert(reader->stack.size == orig_stack_size);
cannam@226 1242 #endif
cannam@226 1243 return ret;
cannam@226 1244 }
cannam@226 1245
cannam@226 1246 static bool
cannam@226 1247 read_objectList(SerdReader* reader, ReadContext ctx, bool* ate_dot)
cannam@226 1248 {
cannam@226 1249 TRY_RET(read_object(reader, &ctx, true, ate_dot));
cannam@226 1250 while (!*ate_dot && eat_delim(reader, ',')) {
cannam@226 1251 TRY_RET(read_object(reader, &ctx, true, ate_dot));
cannam@226 1252 }
cannam@226 1253 return true;
cannam@226 1254 }
cannam@226 1255
cannam@226 1256 static bool
cannam@226 1257 read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot)
cannam@226 1258 {
cannam@226 1259 uint8_t c;
cannam@226 1260 while (true) {
cannam@226 1261 TRY_THROW(read_verb(reader, &ctx.predicate));
cannam@226 1262 read_ws_star(reader);
cannam@226 1263
cannam@226 1264 TRY_THROW(read_objectList(reader, ctx, ate_dot));
cannam@226 1265 ctx.predicate = pop_node(reader, ctx.predicate);
cannam@226 1266 if (*ate_dot) {
cannam@226 1267 return true;
cannam@226 1268 }
cannam@226 1269
cannam@226 1270 bool ate_semi = false;
cannam@226 1271 do {
cannam@226 1272 read_ws_star(reader);
cannam@226 1273 switch (c = peek_byte(reader)) {
cannam@226 1274 case 0:
cannam@226 1275 return false;
cannam@226 1276 case '.': case ']': case '}':
cannam@226 1277 return true;
cannam@226 1278 case ';':
cannam@226 1279 eat_byte_safe(reader, c);
cannam@226 1280 ate_semi = true;
cannam@226 1281 }
cannam@226 1282 } while (c == ';');
cannam@226 1283
cannam@226 1284 if (!ate_semi) {
cannam@226 1285 return r_err(reader, SERD_ERR_BAD_SYNTAX, "missing ';' or '.'\n");
cannam@226 1286 }
cannam@226 1287 }
cannam@226 1288
cannam@226 1289 pop_node(reader, ctx.predicate);
cannam@226 1290 return true;
cannam@226 1291 except:
cannam@226 1292 pop_node(reader, ctx.predicate);
cannam@226 1293 return false;
cannam@226 1294 }
cannam@226 1295
cannam@226 1296 static bool
cannam@226 1297 end_collection(SerdReader* reader, ReadContext ctx, Ref n1, Ref n2, bool ret)
cannam@226 1298 {
cannam@226 1299 pop_node(reader, n2);
cannam@226 1300 pop_node(reader, n1);
cannam@226 1301 *ctx.flags &= ~SERD_LIST_CONT;
cannam@226 1302 return ret && (eat_byte_safe(reader, ')') == ')');
cannam@226 1303 }
cannam@226 1304
cannam@226 1305 static bool
cannam@226 1306 read_collection(SerdReader* reader, ReadContext ctx, Ref* dest)
cannam@226 1307 {
cannam@226 1308 eat_byte_safe(reader, '(');
cannam@226 1309 bool end = peek_delim(reader, ')');
cannam@226 1310 *dest = end ? reader->rdf_nil : blank_id(reader);
cannam@226 1311 if (ctx.subject) {
cannam@226 1312 // subject predicate _:head
cannam@226 1313 *ctx.flags |= (end ? 0 : SERD_LIST_O_BEGIN);
cannam@226 1314 TRY_RET(emit_statement(reader, ctx, *dest, 0, 0));
cannam@226 1315 *ctx.flags |= SERD_LIST_CONT;
cannam@226 1316 } else {
cannam@226 1317 *ctx.flags |= (end ? 0 : SERD_LIST_S_BEGIN);
cannam@226 1318 }
cannam@226 1319
cannam@226 1320 if (end) {
cannam@226 1321 return end_collection(reader, ctx, 0, 0, true);
cannam@226 1322 }
cannam@226 1323
cannam@226 1324 /* The order of node allocation here is necessarily not in stack order,
cannam@226 1325 so we create two nodes and recycle them throughout. */
cannam@226 1326 Ref n1 = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0);
cannam@226 1327 Ref n2 = 0;
cannam@226 1328 Ref node = n1;
cannam@226 1329 Ref rest = 0;
cannam@226 1330
cannam@226 1331 ctx.subject = *dest;
cannam@226 1332 while (!(end = peek_delim(reader, ')'))) {
cannam@226 1333 // _:node rdf:first object
cannam@226 1334 ctx.predicate = reader->rdf_first;
cannam@226 1335 bool ate_dot = false;
cannam@226 1336 if (!read_object(reader, &ctx, true, &ate_dot) || ate_dot) {
cannam@226 1337 return end_collection(reader, ctx, n1, n2, false);
cannam@226 1338 }
cannam@226 1339
cannam@226 1340 if (!(end = peek_delim(reader, ')'))) {
cannam@226 1341 /* Give rest a new ID. Done as late as possible to ensure it is
cannam@226 1342 used and > IDs generated by read_object above. */
cannam@226 1343 if (!rest) {
cannam@226 1344 rest = n2 = blank_id(reader); // First pass, push
cannam@226 1345 } else {
cannam@226 1346 set_blank_id(reader, rest, genid_size(reader));
cannam@226 1347 }
cannam@226 1348 }
cannam@226 1349
cannam@226 1350 // _:node rdf:rest _:rest
cannam@226 1351 *ctx.flags |= SERD_LIST_CONT;
cannam@226 1352 ctx.predicate = reader->rdf_rest;
cannam@226 1353 TRY_RET(emit_statement(reader, ctx,
cannam@226 1354 (end ? reader->rdf_nil : rest), 0, 0));
cannam@226 1355
cannam@226 1356 ctx.subject = rest; // _:node = _:rest
cannam@226 1357 rest = node; // _:rest = (old)_:node
cannam@226 1358 node = ctx.subject; // invariant
cannam@226 1359 }
cannam@226 1360
cannam@226 1361 return end_collection(reader, ctx, n1, n2, true);
cannam@226 1362 }
cannam@226 1363
cannam@226 1364 static Ref
cannam@226 1365 read_subject(SerdReader* reader, ReadContext ctx, Ref* dest, char* s_type)
cannam@226 1366 {
cannam@226 1367 bool ate_dot = false;
cannam@226 1368 switch ((*s_type = peek_byte(reader))) {
cannam@226 1369 case '[':
cannam@226 1370 read_anon(reader, ctx, true, dest);
cannam@226 1371 break;
cannam@226 1372 case '(':
cannam@226 1373 read_collection(reader, ctx, dest);
cannam@226 1374 break;
cannam@226 1375 case '_':
cannam@226 1376 *dest = read_BLANK_NODE_LABEL(reader, &ate_dot);
cannam@226 1377 break;
cannam@226 1378 default:
cannam@226 1379 TRY_RET(read_iri(reader, dest, &ate_dot));
cannam@226 1380 }
cannam@226 1381 return ate_dot ? pop_node(reader, *dest) : *dest;
cannam@226 1382 }
cannam@226 1383
cannam@226 1384 static Ref
cannam@226 1385 read_labelOrSubject(SerdReader* reader, ReadContext ctx)
cannam@226 1386 {
cannam@226 1387 Ref subject = 0;
cannam@226 1388 bool ate_dot = false;
cannam@226 1389 switch (peek_byte(reader)) {
cannam@226 1390 case '[':
cannam@226 1391 eat_byte_safe(reader, '[');
cannam@226 1392 read_ws_star(reader);
cannam@226 1393 TRY_RET(eat_byte_check(reader, ']'));
cannam@226 1394 return blank_id(reader);
cannam@226 1395 case '_':
cannam@226 1396 return read_BLANK_NODE_LABEL(reader, &ate_dot);
cannam@226 1397 default:
cannam@226 1398 read_iri(reader, &subject, &ate_dot);
cannam@226 1399 }
cannam@226 1400 return subject;
cannam@226 1401 }
cannam@226 1402
cannam@226 1403 static bool
cannam@226 1404 read_triples(SerdReader* reader, ReadContext ctx, bool* ate_dot)
cannam@226 1405 {
cannam@226 1406 bool ret = false;
cannam@226 1407 if (ctx.subject) {
cannam@226 1408 read_ws_star(reader);
cannam@226 1409 switch (peek_byte(reader)) {
cannam@226 1410 case '.':
cannam@226 1411 *ate_dot = eat_byte_safe(reader, '.');
cannam@226 1412 return false;
cannam@226 1413 case '}':
cannam@226 1414 return false;
cannam@226 1415 }
cannam@226 1416 ret = read_predicateObjectList(reader, ctx, ate_dot);
cannam@226 1417 }
cannam@226 1418 ctx.subject = ctx.predicate = 0;
cannam@226 1419 return ret;
cannam@226 1420 }
cannam@226 1421
cannam@226 1422 static bool
cannam@226 1423 read_base(SerdReader* reader, bool sparql, bool token)
cannam@226 1424 {
cannam@226 1425 if (token) {
cannam@226 1426 TRY_RET(eat_string(reader, "base", 4));
cannam@226 1427 }
cannam@226 1428
cannam@226 1429 Ref uri;
cannam@226 1430 read_ws_star(reader);
cannam@226 1431 TRY_RET(uri = read_IRIREF(reader));
cannam@226 1432 if (reader->base_sink) {
cannam@226 1433 reader->base_sink(reader->handle, deref(reader, uri));
cannam@226 1434 }
cannam@226 1435 pop_node(reader, uri);
cannam@226 1436
cannam@226 1437 read_ws_star(reader);
cannam@226 1438 if (!sparql) {
cannam@226 1439 return eat_byte_check(reader, '.');
cannam@226 1440 } else if (peek_byte(reader) == '.') {
cannam@226 1441 return r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 1442 "full stop after SPARQL BASE\n");
cannam@226 1443 }
cannam@226 1444 return true;
cannam@226 1445 }
cannam@226 1446
cannam@226 1447 static bool
cannam@226 1448 read_prefixID(SerdReader* reader, bool sparql, bool token)
cannam@226 1449 {
cannam@226 1450 if (token) {
cannam@226 1451 TRY_RET(eat_string(reader, "prefix", 6));
cannam@226 1452 }
cannam@226 1453
cannam@226 1454 read_ws_star(reader);
cannam@226 1455 bool ret = true;
cannam@226 1456 Ref name = push_node(reader, SERD_LITERAL, "", 0);
cannam@226 1457 if (read_PN_PREFIX(reader, name) > SERD_FAILURE) {
cannam@226 1458 return pop_node(reader, name);
cannam@226 1459 }
cannam@226 1460
cannam@226 1461 if (eat_byte_check(reader, ':') != ':') {
cannam@226 1462 return pop_node(reader, name);
cannam@226 1463 }
cannam@226 1464
cannam@226 1465 read_ws_star(reader);
cannam@226 1466 const Ref uri = read_IRIREF(reader);
cannam@226 1467 if (!uri) {
cannam@226 1468 pop_node(reader, name);
cannam@226 1469 return false;
cannam@226 1470 }
cannam@226 1471
cannam@226 1472 if (reader->prefix_sink) {
cannam@226 1473 ret = !reader->prefix_sink(reader->handle,
cannam@226 1474 deref(reader, name),
cannam@226 1475 deref(reader, uri));
cannam@226 1476 }
cannam@226 1477 pop_node(reader, uri);
cannam@226 1478 pop_node(reader, name);
cannam@226 1479 if (!sparql) {
cannam@226 1480 read_ws_star(reader);
cannam@226 1481 return eat_byte_check(reader, '.');
cannam@226 1482 }
cannam@226 1483 return ret;
cannam@226 1484 }
cannam@226 1485
cannam@226 1486 static bool
cannam@226 1487 read_directive(SerdReader* reader)
cannam@226 1488 {
cannam@226 1489 const bool sparql = peek_byte(reader) != '@';
cannam@226 1490 if (!sparql) {
cannam@226 1491 eat_byte_safe(reader, '@');
cannam@226 1492 switch (peek_byte(reader)) {
cannam@226 1493 case 'B': case 'P':
cannam@226 1494 return r_err(reader, SERD_ERR_BAD_SYNTAX,
cannam@226 1495 "uppercase directive\n");
cannam@226 1496 }
cannam@226 1497 }
cannam@226 1498
cannam@226 1499 switch (peek_byte(reader)) {
cannam@226 1500 case 'B': case 'b': return read_base(reader, sparql, true);
cannam@226 1501 case 'P': case 'p': return read_prefixID(reader, sparql, true);
cannam@226 1502 default:
cannam@226 1503 return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid directive\n");
cannam@226 1504 }
cannam@226 1505
cannam@226 1506 return true;
cannam@226 1507 }
cannam@226 1508
cannam@226 1509 static bool
cannam@226 1510 read_wrappedGraph(SerdReader* reader, ReadContext* ctx)
cannam@226 1511 {
cannam@226 1512 bool ate_dot = false;
cannam@226 1513 char s_type = 0;
cannam@226 1514 TRY_RET(eat_byte_check(reader, '{'));
cannam@226 1515 read_ws_star(reader);
cannam@226 1516 while (peek_byte(reader) != '}') {
cannam@226 1517 ctx->subject = 0;
cannam@226 1518 Ref subj = read_subject(reader, *ctx, &ctx->subject, &s_type);
cannam@226 1519 if (!subj ||
cannam@226 1520 (!read_triples(reader, *ctx, &ate_dot) && s_type != '[')) {
cannam@226 1521 return false;
cannam@226 1522 }
cannam@226 1523 pop_node(reader, subj);
cannam@226 1524 read_ws_star(reader);
cannam@226 1525 if (peek_byte(reader) == '.') {
cannam@226 1526 eat_byte_safe(reader, '.');
cannam@226 1527 }
cannam@226 1528 read_ws_star(reader);
cannam@226 1529 }
cannam@226 1530 return eat_byte_check(reader, '}');
cannam@226 1531 }
cannam@226 1532
cannam@226 1533 static int
cannam@226 1534 tokcmp(SerdReader* reader, Ref ref, const char* tok, size_t n)
cannam@226 1535 {
cannam@226 1536 SerdNode* node = deref(reader, ref);
cannam@226 1537 if (!node || node->n_bytes != n) {
cannam@226 1538 return -1;
cannam@226 1539 }
cannam@226 1540 const char* s1 = (const char*)node->buf;
cannam@226 1541 const char* s2 = tok;
cannam@226 1542 for (; n > 0 && *s2; s1++, s2++, --n) {
cannam@226 1543 if (toupper(*s1) != toupper(*s2)) {
cannam@226 1544 return ((*(uint8_t*)s1 < *(uint8_t*)s2) ? -1 : +1);
cannam@226 1545 }
cannam@226 1546 }
cannam@226 1547 return 0;
cannam@226 1548 }
cannam@226 1549
cannam@226 1550 static bool
cannam@226 1551 read_statement(SerdReader* reader)
cannam@226 1552 {
cannam@226 1553 SerdStatementFlags flags = 0;
cannam@226 1554 ReadContext ctx = { 0, 0, 0, 0, 0, 0, &flags };
cannam@226 1555 Ref subj = 0;
cannam@226 1556 bool ate_dot = false;
cannam@226 1557 char s_type = false;
cannam@226 1558 bool ret = true;
cannam@226 1559 read_ws_star(reader);
cannam@226 1560 switch (peek_byte(reader)) {
cannam@226 1561 case '\0':
cannam@226 1562 reader->eof = true;
cannam@226 1563 return reader->status <= SERD_FAILURE;
cannam@226 1564 case '@':
cannam@226 1565 TRY_RET(read_directive(reader));
cannam@226 1566 read_ws_star(reader);
cannam@226 1567 break;
cannam@226 1568 case '{':
cannam@226 1569 if (reader->syntax == SERD_TRIG) {
cannam@226 1570 TRY_RET(read_wrappedGraph(reader, &ctx));
cannam@226 1571 read_ws_star(reader);
cannam@226 1572 } else {
cannam@226 1573 return r_err(reader, SERD_ERR_BAD_SYNTAX, "graph in Turtle\n");
cannam@226 1574 }
cannam@226 1575 break;
cannam@226 1576 default:
cannam@226 1577 subj = read_subject(reader, ctx, &ctx.subject, &s_type);
cannam@226 1578 if (!tokcmp(reader, ctx.subject, "base", 4)) {
cannam@226 1579 ret = read_base(reader, true, false);
cannam@226 1580 } else if (!tokcmp(reader, ctx.subject, "prefix", 6)) {
cannam@226 1581 ret = read_prefixID(reader, true, false);
cannam@226 1582 } else if (!tokcmp(reader, ctx.subject, "graph", 5)) {
cannam@226 1583 read_ws_star(reader);
cannam@226 1584 TRY_RET((ctx.graph = read_labelOrSubject(reader, ctx)));
cannam@226 1585 read_ws_star(reader);
cannam@226 1586 TRY_RET(read_wrappedGraph(reader, &ctx));
cannam@226 1587 read_ws_star(reader);
cannam@226 1588 } else if (read_ws_star(reader) && peek_byte(reader) == '{') {
cannam@226 1589 if (s_type == '(' || (s_type == '[' && !*ctx.flags)) {
cannam@226 1590 return false; // invalid graph with complex label
cannam@226 1591 }
cannam@226 1592 ctx.graph = subj;
cannam@226 1593 ctx.subject = subj = 0;
cannam@226 1594 TRY_RET(read_wrappedGraph(reader, &ctx));
cannam@226 1595 read_ws_star(reader);
cannam@226 1596 } else if (!subj) {
cannam@226 1597 ret = r_err(reader, SERD_ERR_BAD_SYNTAX, "bad subject\n");
cannam@226 1598 } else if (!read_triples(reader, ctx, &ate_dot)) {
cannam@226 1599 ret = (s_type == '[');
cannam@226 1600 } else if (!ate_dot) {
cannam@226 1601 read_ws_star(reader);
cannam@226 1602 ret = (eat_byte_check(reader, '.') == '.');
cannam@226 1603 }
cannam@226 1604 pop_node(reader, subj);
cannam@226 1605 break;
cannam@226 1606 }
cannam@226 1607 return ret;
cannam@226 1608 }
cannam@226 1609
cannam@226 1610 static bool
cannam@226 1611 read_turtleDoc(SerdReader* reader)
cannam@226 1612 {
cannam@226 1613 while (!reader->eof) {
cannam@226 1614 TRY_RET(read_statement(reader));
cannam@226 1615 }
cannam@226 1616 return reader->status <= SERD_FAILURE;
cannam@226 1617 }
cannam@226 1618
cannam@226 1619 static bool
cannam@226 1620 read_trigDoc(SerdReader* reader)
cannam@226 1621 {
cannam@226 1622 while (!reader->eof) {
cannam@226 1623 TRY_RET(read_statement(reader));
cannam@226 1624 }
cannam@226 1625 return reader->status <= SERD_FAILURE;
cannam@226 1626 }
cannam@226 1627
cannam@226 1628 static bool
cannam@226 1629 read_nquadsDoc(SerdReader* reader)
cannam@226 1630 {
cannam@226 1631 while (!reader->eof) {
cannam@226 1632 SerdStatementFlags flags = 0;
cannam@226 1633 ReadContext ctx = { 0, 0, 0, 0, 0, 0, &flags };
cannam@226 1634 bool ate_dot = false;
cannam@226 1635 char s_type = false;
cannam@226 1636 read_ws_star(reader);
cannam@226 1637 if (peek_byte(reader) == '\0') {
cannam@226 1638 reader->eof = true;
cannam@226 1639 break;
cannam@226 1640 }
cannam@226 1641
cannam@226 1642 // subject predicate object
cannam@226 1643 if (!(ctx.subject = read_subject(reader, ctx, &ctx.subject, &s_type)) ||
cannam@226 1644 !read_ws_star(reader) ||
cannam@226 1645 !(ctx.predicate = read_IRIREF(reader)) ||
cannam@226 1646 !read_ws_star(reader) ||
cannam@226 1647 !read_object(reader, &ctx, false, &ate_dot)) {
cannam@226 1648 return false;
cannam@226 1649 }
cannam@226 1650
cannam@226 1651 if (!ate_dot) { // graphLabel?
cannam@226 1652 TRY_RET(read_ws_star(reader));
cannam@226 1653 switch (peek_byte(reader)) {
cannam@226 1654 case '.':
cannam@226 1655 break;
cannam@226 1656 case '_':
cannam@226 1657 ctx.graph = read_BLANK_NODE_LABEL(reader, &ate_dot);
cannam@226 1658 break;
cannam@226 1659 default:
cannam@226 1660 if (!(ctx.graph = read_IRIREF(reader))) {
cannam@226 1661 return false;
cannam@226 1662 }
cannam@226 1663 }
cannam@226 1664
cannam@226 1665 // Terminating '.'
cannam@226 1666 TRY_RET(read_ws_star(reader));
cannam@226 1667 eat_byte_check(reader, '.');
cannam@226 1668 }
cannam@226 1669
cannam@226 1670 TRY_RET(emit_statement(reader, ctx, ctx.object, ctx.datatype, ctx.lang));
cannam@226 1671 pop_node(reader, ctx.graph);
cannam@226 1672 pop_node(reader, ctx.lang);
cannam@226 1673 pop_node(reader, ctx.datatype);
cannam@226 1674 pop_node(reader, ctx.object);
cannam@226 1675 }
cannam@226 1676 return reader->status <= SERD_FAILURE;
cannam@226 1677 }
cannam@226 1678
cannam@226 1679 static bool
cannam@226 1680 read_doc(SerdReader* reader)
cannam@226 1681 {
cannam@226 1682 switch (reader->syntax) {
cannam@226 1683 case SERD_NQUADS: return read_nquadsDoc(reader);
cannam@226 1684 case SERD_TRIG: return read_trigDoc(reader);
cannam@226 1685 default: return read_turtleDoc(reader);
cannam@226 1686 }
cannam@226 1687 }
cannam@226 1688
cannam@226 1689 SERD_API
cannam@226 1690 SerdReader*
cannam@226 1691 serd_reader_new(SerdSyntax syntax,
cannam@226 1692 void* handle,
cannam@226 1693 void (*free_handle)(void*),
cannam@226 1694 SerdBaseSink base_sink,
cannam@226 1695 SerdPrefixSink prefix_sink,
cannam@226 1696 SerdStatementSink statement_sink,
cannam@226 1697 SerdEndSink end_sink)
cannam@226 1698 {
cannam@226 1699 const Cursor cur = { NULL, 0, 0 };
cannam@226 1700 SerdReader* me = (SerdReader*)calloc(1, sizeof(SerdReader));
cannam@226 1701 me->handle = handle;
cannam@226 1702 me->free_handle = free_handle;
cannam@226 1703 me->base_sink = base_sink;
cannam@226 1704 me->prefix_sink = prefix_sink;
cannam@226 1705 me->statement_sink = statement_sink;
cannam@226 1706 me->end_sink = end_sink;
cannam@226 1707 me->default_graph = SERD_NODE_NULL;
cannam@226 1708 me->stack = serd_stack_new(SERD_PAGE_SIZE);
cannam@226 1709 me->syntax = syntax;
cannam@226 1710 me->cur = cur;
cannam@226 1711 me->next_id = 1;
cannam@226 1712
cannam@226 1713 me->rdf_first = push_node(me, SERD_URI, NS_RDF "first", 48);
cannam@226 1714 me->rdf_rest = push_node(me, SERD_URI, NS_RDF "rest", 47);
cannam@226 1715 me->rdf_nil = push_node(me, SERD_URI, NS_RDF "nil", 46);
cannam@226 1716
cannam@226 1717 return me;
cannam@226 1718 }
cannam@226 1719
cannam@226 1720 SERD_API
cannam@226 1721 void
cannam@226 1722 serd_reader_set_strict(SerdReader* reader, bool strict)
cannam@226 1723 {
cannam@226 1724 reader->strict = strict;
cannam@226 1725 }
cannam@226 1726
cannam@226 1727 SERD_API
cannam@226 1728 void
cannam@226 1729 serd_reader_set_error_sink(SerdReader* reader,
cannam@226 1730 SerdErrorSink error_sink,
cannam@226 1731 void* error_handle)
cannam@226 1732 {
cannam@226 1733 reader->error_sink = error_sink;
cannam@226 1734 reader->error_handle = error_handle;
cannam@226 1735 }
cannam@226 1736
cannam@226 1737 SERD_API
cannam@226 1738 void
cannam@226 1739 serd_reader_free(SerdReader* reader)
cannam@226 1740 {
cannam@226 1741 pop_node(reader, reader->rdf_nil);
cannam@226 1742 pop_node(reader, reader->rdf_rest);
cannam@226 1743 pop_node(reader, reader->rdf_first);
cannam@226 1744 serd_node_free(&reader->default_graph);
cannam@226 1745
cannam@226 1746 #ifdef SERD_STACK_CHECK
cannam@226 1747 free(reader->allocs);
cannam@226 1748 #endif
cannam@226 1749 free(reader->stack.buf);
cannam@226 1750 free(reader->bprefix);
cannam@226 1751 if (reader->free_handle) {
cannam@226 1752 reader->free_handle(reader->handle);
cannam@226 1753 }
cannam@226 1754 free(reader);
cannam@226 1755 }
cannam@226 1756
cannam@226 1757 SERD_API
cannam@226 1758 void*
cannam@226 1759 serd_reader_get_handle(const SerdReader* reader)
cannam@226 1760 {
cannam@226 1761 return reader->handle;
cannam@226 1762 }
cannam@226 1763
cannam@226 1764 SERD_API
cannam@226 1765 void
cannam@226 1766 serd_reader_add_blank_prefix(SerdReader* reader,
cannam@226 1767 const uint8_t* prefix)
cannam@226 1768 {
cannam@226 1769 free(reader->bprefix);
cannam@226 1770 reader->bprefix_len = 0;
cannam@226 1771 reader->bprefix = NULL;
cannam@226 1772 if (prefix) {
cannam@226 1773 reader->bprefix_len = strlen((const char*)prefix);
cannam@226 1774 reader->bprefix = (uint8_t*)malloc(reader->bprefix_len + 1);
cannam@226 1775 memcpy(reader->bprefix, prefix, reader->bprefix_len + 1);
cannam@226 1776 }
cannam@226 1777 }
cannam@226 1778
cannam@226 1779 SERD_API
cannam@226 1780 void
cannam@226 1781 serd_reader_set_default_graph(SerdReader* reader,
cannam@226 1782 const SerdNode* graph)
cannam@226 1783 {
cannam@226 1784 serd_node_free(&reader->default_graph);
cannam@226 1785 reader->default_graph = serd_node_copy(graph);
cannam@226 1786 }
cannam@226 1787
cannam@226 1788 SERD_API
cannam@226 1789 SerdStatus
cannam@226 1790 serd_reader_read_file(SerdReader* reader,
cannam@226 1791 const uint8_t* uri)
cannam@226 1792 {
cannam@226 1793 uint8_t* const path = serd_file_uri_parse(uri, NULL);
cannam@226 1794 if (!path) {
cannam@226 1795 return SERD_ERR_BAD_ARG;
cannam@226 1796 }
cannam@226 1797
cannam@226 1798 FILE* fd = serd_fopen((const char*)path, "r");
cannam@226 1799 if (!fd) {
cannam@226 1800 free(path);
cannam@226 1801 return SERD_ERR_UNKNOWN;
cannam@226 1802 }
cannam@226 1803
cannam@226 1804 SerdStatus ret = serd_reader_read_file_handle(reader, fd, path);
cannam@226 1805 fclose(fd);
cannam@226 1806 free(path);
cannam@226 1807 return ret;
cannam@226 1808 }
cannam@226 1809
cannam@226 1810 static bool
cannam@226 1811 skip_bom(SerdReader* me)
cannam@226 1812 {
cannam@226 1813 if (peek_byte(me) == 0xEF) {
cannam@226 1814 eat_byte_safe(me, 0xEF);
cannam@226 1815 if (eat_byte_check(me, 0xBB) != 0xBB ||
cannam@226 1816 eat_byte_check(me, 0xBF) != 0xBF) {
cannam@226 1817 return r_err(me, SERD_ERR_BAD_SYNTAX, "corrupt byte order mark\n");
cannam@226 1818 }
cannam@226 1819 }
cannam@226 1820
cannam@226 1821 return true;
cannam@226 1822 }
cannam@226 1823
cannam@226 1824 SERD_API
cannam@226 1825 SerdStatus
cannam@226 1826 serd_reader_start_stream(SerdReader* me,
cannam@226 1827 FILE* file,
cannam@226 1828 const uint8_t* name,
cannam@226 1829 bool bulk)
cannam@226 1830 {
cannam@226 1831 return serd_reader_start_source_stream(
cannam@226 1832 me,
cannam@226 1833 bulk ? (SerdSource)fread : serd_file_read_byte,
cannam@226 1834 (SerdStreamErrorFunc)ferror,
cannam@226 1835 file,
cannam@226 1836 name,
cannam@226 1837 bulk ? SERD_PAGE_SIZE : 1);
cannam@226 1838 }
cannam@226 1839
cannam@226 1840 SERD_API
cannam@226 1841 SerdStatus
cannam@226 1842 serd_reader_start_source_stream(SerdReader* me,
cannam@226 1843 SerdSource read_func,
cannam@226 1844 SerdStreamErrorFunc error_func,
cannam@226 1845 void* stream,
cannam@226 1846 const uint8_t* name,
cannam@226 1847 size_t page_size)
cannam@226 1848 {
cannam@226 1849 const Cursor cur = { name, 1, 1 };
cannam@226 1850 me->cur = cur;
cannam@226 1851
cannam@226 1852 return serd_byte_source_open_source(
cannam@226 1853 &me->source, read_func, error_func, stream, page_size);
cannam@226 1854 }
cannam@226 1855
cannam@226 1856 static SerdStatus
cannam@226 1857 serd_reader_prepare(SerdReader* me)
cannam@226 1858 {
cannam@226 1859 me->eof = false;
cannam@226 1860 if ((me->status = serd_byte_source_prepare(&me->source))) {
cannam@226 1861 r_err(me, me->status, "read error: %s\n", strerror(errno));
cannam@226 1862 } else if (!skip_bom(me)) {
cannam@226 1863 me->status = SERD_ERR_BAD_SYNTAX;
cannam@226 1864 }
cannam@226 1865 return me->status;
cannam@226 1866 }
cannam@226 1867
cannam@226 1868 SERD_API
cannam@226 1869 SerdStatus
cannam@226 1870 serd_reader_read_chunk(SerdReader* me)
cannam@226 1871 {
cannam@226 1872 SerdStatus st = SERD_SUCCESS;
cannam@226 1873 if (!me->source.prepared) {
cannam@226 1874 if ((st = serd_reader_prepare(me))) {
cannam@226 1875 return st;
cannam@226 1876 }
cannam@226 1877 } else if (me->eof) {
cannam@226 1878 me->eof = false;
cannam@226 1879 if ((st = serd_byte_source_advance(&me->source))) {
cannam@226 1880 return st;
cannam@226 1881 }
cannam@226 1882 }
cannam@226 1883 return read_statement(me) ? SERD_SUCCESS : SERD_FAILURE;
cannam@226 1884 }
cannam@226 1885
cannam@226 1886 SERD_API
cannam@226 1887 SerdStatus
cannam@226 1888 serd_reader_end_stream(SerdReader* me)
cannam@226 1889 {
cannam@226 1890 return serd_byte_source_close(&me->source);
cannam@226 1891 }
cannam@226 1892
cannam@226 1893 SERD_API
cannam@226 1894 SerdStatus
cannam@226 1895 serd_reader_read_file_handle(SerdReader* me, FILE* file, const uint8_t* name)
cannam@226 1896 {
cannam@226 1897 return serd_reader_read_source(
cannam@226 1898 me, (SerdSource)fread, (SerdStreamErrorFunc)ferror,
cannam@226 1899 file, name, SERD_PAGE_SIZE);
cannam@226 1900 }
cannam@226 1901
cannam@226 1902 SERD_API
cannam@226 1903 SerdStatus
cannam@226 1904 serd_reader_read_source(SerdReader* me,
cannam@226 1905 SerdSource source,
cannam@226 1906 SerdStreamErrorFunc error,
cannam@226 1907 void* stream,
cannam@226 1908 const uint8_t* name,
cannam@226 1909 size_t page_size)
cannam@226 1910 {
cannam@226 1911 SerdStatus st = serd_reader_start_source_stream(
cannam@226 1912 me, source, error, stream, name, page_size);
cannam@226 1913
cannam@226 1914 if ((st = serd_reader_prepare(me))) {
cannam@226 1915 serd_reader_end_stream(me);
cannam@226 1916 return st;
cannam@226 1917 } else if (!read_doc(me)) {
cannam@226 1918 serd_reader_end_stream(me);
cannam@226 1919 return SERD_ERR_UNKNOWN;
cannam@226 1920 }
cannam@226 1921
cannam@226 1922 return serd_reader_end_stream(me);
cannam@226 1923 }
cannam@226 1924
cannam@226 1925 SERD_API
cannam@226 1926 SerdStatus
cannam@226 1927 serd_reader_read_string(SerdReader* me, const uint8_t* utf8)
cannam@226 1928 {
cannam@226 1929 const Cursor cur = { (const uint8_t*)"(string)", 1, 1 };
cannam@226 1930
cannam@226 1931 serd_byte_source_open_string(&me->source, utf8);
cannam@226 1932 me->cur = cur;
cannam@226 1933 me->eof = false;
cannam@226 1934
cannam@226 1935 SerdStatus st = serd_reader_prepare(me);
cannam@226 1936 if (!st) {
cannam@226 1937 st = read_doc(me) ? SERD_SUCCESS : SERD_ERR_UNKNOWN;
cannam@226 1938 }
cannam@226 1939
cannam@226 1940 serd_byte_source_close(&me->source);
cannam@226 1941
cannam@226 1942 return st;
cannam@226 1943 }