cannam@226: /* cannam@226: Copyright 2011-2017 David Robillard cannam@226: cannam@226: Permission to use, copy, modify, and/or distribute this software for any cannam@226: purpose with or without fee is hereby granted, provided that the above cannam@226: copyright notice and this permission notice appear in all copies. cannam@226: cannam@226: THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES cannam@226: WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF cannam@226: MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR cannam@226: ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES cannam@226: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN cannam@226: ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF cannam@226: OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. cannam@226: */ cannam@226: cannam@226: #include "serd_internal.h" cannam@226: cannam@226: #include cannam@226: #include cannam@226: #include cannam@226: #include cannam@226: #include cannam@226: #include cannam@226: #include cannam@226: #include cannam@226: cannam@226: #define NS_XSD "http://www.w3.org/2001/XMLSchema#" cannam@226: #define NS_RDF "http://www.w3.org/1999/02/22-rdf-syntax-ns#" cannam@226: cannam@226: #define TRY_THROW(exp) if (!(exp)) goto except; cannam@226: #define TRY_RET(exp) if (!(exp)) return 0; cannam@226: cannam@226: #ifdef SERD_STACK_CHECK cannam@226: # define SERD_STACK_ASSERT_TOP(reader, ref) \ cannam@226: assert(ref == reader->allocs[reader->n_allocs - 1]); cannam@226: #else cannam@226: # define SERD_STACK_ASSERT_TOP(reader, ref) cannam@226: #endif cannam@226: cannam@226: typedef struct { cannam@226: const uint8_t* filename; cannam@226: unsigned line; cannam@226: unsigned col; cannam@226: } Cursor; cannam@226: cannam@226: typedef uint32_t uchar; cannam@226: cannam@226: /* Reference to a node in the stack (we can not use pointers since the cannam@226: 
stack may be reallocated, invalidating any pointers to elements). cannam@226: */ cannam@226: typedef size_t Ref; cannam@226: cannam@226: typedef struct { cannam@226: Ref graph; cannam@226: Ref subject; cannam@226: Ref predicate; cannam@226: Ref object; cannam@226: Ref datatype; cannam@226: Ref lang; cannam@226: SerdStatementFlags* flags; cannam@226: } ReadContext; cannam@226: cannam@226: struct SerdReaderImpl { cannam@226: void* handle; cannam@226: void (*free_handle)(void* ptr); cannam@226: SerdBaseSink base_sink; cannam@226: SerdPrefixSink prefix_sink; cannam@226: SerdStatementSink statement_sink; cannam@226: SerdEndSink end_sink; cannam@226: SerdErrorSink error_sink; cannam@226: void* error_handle; cannam@226: Ref rdf_first; cannam@226: Ref rdf_rest; cannam@226: Ref rdf_nil; cannam@226: SerdNode default_graph; cannam@226: SerdByteSource source; cannam@226: SerdStack stack; cannam@226: SerdSyntax syntax; cannam@226: unsigned next_id; cannam@226: Cursor cur; cannam@226: SerdStatus status; cannam@226: uint8_t* buf; cannam@226: uint8_t* bprefix; cannam@226: size_t bprefix_len; cannam@226: bool strict; ///< True iff strict parsing cannam@226: bool eof; cannam@226: bool seen_genid; cannam@226: #ifdef SERD_STACK_CHECK cannam@226: Ref* allocs; ///< Stack of push offsets cannam@226: size_t n_allocs; ///< Number of stack pushes cannam@226: #endif cannam@226: }; cannam@226: cannam@226: static inline bool cannam@226: supports_fancy_literals(const SerdReader* reader) cannam@226: { cannam@226: return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG; cannam@226: } cannam@226: cannam@226: static inline bool cannam@226: supports_relative_iris(const SerdReader* reader) cannam@226: { cannam@226: return reader->syntax == SERD_TURTLE || reader->syntax == SERD_TRIG; cannam@226: } cannam@226: cannam@226: static int cannam@226: r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...) 
cannam@226: { cannam@226: va_list args; cannam@226: va_start(args, fmt); cannam@226: const SerdError e = { cannam@226: st, reader->cur.filename, reader->cur.line, reader->cur.col, fmt, &args cannam@226: }; cannam@226: serd_error(reader->error_sink, reader->error_handle, &e); cannam@226: va_end(args); cannam@226: return 0; cannam@226: } cannam@226: cannam@226: /** fread-like wrapper for getc (which is faster). */ cannam@226: static size_t cannam@226: serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream) cannam@226: { cannam@226: const int c = getc((FILE*)stream); cannam@226: if (c == EOF) { cannam@226: *((uint8_t*)buf) = 0; cannam@226: return 0; cannam@226: } cannam@226: *((uint8_t*)buf) = (uint8_t)c; cannam@226: return 1; cannam@226: } cannam@226: cannam@226: static inline uint8_t cannam@226: peek_byte(SerdReader* reader) cannam@226: { cannam@226: return serd_byte_source_peek(&reader->source); cannam@226: } cannam@226: cannam@226: static inline uint8_t cannam@226: eat_byte_safe(SerdReader* reader, const uint8_t byte) cannam@226: { cannam@226: assert(peek_byte(reader) == byte); cannam@226: switch (byte) { cannam@226: case '\0': reader->eof = (byte != '\0'); break; cannam@226: case '\n': ++reader->cur.line; reader->cur.col = 0; break; cannam@226: default: ++reader->cur.col; cannam@226: } cannam@226: cannam@226: reader->status = serd_byte_source_advance(&reader->source); cannam@226: return byte; cannam@226: } cannam@226: cannam@226: static inline uint8_t cannam@226: eat_byte_check(SerdReader* reader, const uint8_t byte) cannam@226: { cannam@226: const uint8_t c = peek_byte(reader); cannam@226: if (c != byte) { cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "expected `%c', not `%c'\n", byte, c); cannam@226: } cannam@226: return eat_byte_safe(reader, byte); cannam@226: } cannam@226: cannam@226: static inline bool cannam@226: eat_string(SerdReader* reader, const char* str, unsigned n) cannam@226: { cannam@226: bool bad = false; 
cannam@226: for (unsigned i = 0; i < n; ++i) { cannam@226: bad |= eat_byte_check(reader, ((const uint8_t*)str)[i]); cannam@226: } cannam@226: return bad; cannam@226: } cannam@226: cannam@226: static Ref cannam@226: push_node_padded(SerdReader* reader, size_t maxlen, cannam@226: SerdType type, const char* str, size_t n_bytes) cannam@226: { cannam@226: void* mem = serd_stack_push_aligned( cannam@226: &reader->stack, sizeof(SerdNode) + maxlen + 1, sizeof(SerdNode)); cannam@226: cannam@226: SerdNode* const node = (SerdNode*)mem; cannam@226: node->n_bytes = node->n_chars = n_bytes; cannam@226: node->flags = 0; cannam@226: node->type = type; cannam@226: node->buf = NULL; cannam@226: cannam@226: uint8_t* buf = (uint8_t*)(node + 1); cannam@226: memcpy(buf, str, n_bytes + 1); cannam@226: cannam@226: #ifdef SERD_STACK_CHECK cannam@226: reader->allocs = realloc( cannam@226: reader->allocs, sizeof(uint8_t*) * (++reader->n_allocs)); cannam@226: reader->allocs[reader->n_allocs - 1] = ((uint8_t*)mem - reader->stack.buf); cannam@226: #endif cannam@226: return (uint8_t*)node - reader->stack.buf; cannam@226: } cannam@226: cannam@226: static Ref cannam@226: push_node(SerdReader* reader, SerdType type, const char* str, size_t n_bytes) cannam@226: { cannam@226: return push_node_padded(reader, n_bytes, type, str, n_bytes); cannam@226: } cannam@226: cannam@226: static inline SerdNode* cannam@226: deref(SerdReader* reader, const Ref ref) cannam@226: { cannam@226: if (ref) { cannam@226: SerdNode* node = (SerdNode*)(reader->stack.buf + ref); cannam@226: node->buf = (uint8_t*)node + sizeof(SerdNode); cannam@226: return node; cannam@226: } cannam@226: return NULL; cannam@226: } cannam@226: cannam@226: static inline void cannam@226: push_byte(SerdReader* reader, Ref ref, const uint8_t c) cannam@226: { cannam@226: SERD_STACK_ASSERT_TOP(reader, ref); cannam@226: uint8_t* const s = serd_stack_push(&reader->stack, 1); cannam@226: SerdNode* const node = (SerdNode*)(reader->stack.buf + ref); 
cannam@226: ++node->n_bytes; cannam@226: if (!(c & 0x80)) { // Starts with 0 bit, start of new character cannam@226: ++node->n_chars; cannam@226: } cannam@226: *(s - 1) = c; cannam@226: *s = '\0'; cannam@226: } cannam@226: cannam@226: static inline void cannam@226: push_replacement(SerdReader* reader, Ref dest) cannam@226: { cannam@226: push_byte(reader, dest, 0xEF); cannam@226: push_byte(reader, dest, 0xBF); cannam@226: push_byte(reader, dest, 0xBD); cannam@226: } cannam@226: cannam@226: static Ref cannam@226: pop_node(SerdReader* reader, Ref ref) cannam@226: { cannam@226: if (ref && ref != reader->rdf_first && ref != reader->rdf_rest cannam@226: && ref != reader->rdf_nil) { cannam@226: #ifdef SERD_STACK_CHECK cannam@226: SERD_STACK_ASSERT_TOP(reader, ref); cannam@226: --reader->n_allocs; cannam@226: #endif cannam@226: SerdNode* const node = deref(reader, ref); cannam@226: uint8_t* const top = reader->stack.buf + reader->stack.size; cannam@226: serd_stack_pop_aligned(&reader->stack, top - (uint8_t*)node); cannam@226: } cannam@226: return 0; cannam@226: } cannam@226: cannam@226: static inline bool cannam@226: emit_statement(SerdReader* reader, ReadContext ctx, Ref o, Ref d, Ref l) cannam@226: { cannam@226: SerdNode* graph = deref(reader, ctx.graph); cannam@226: if (!graph && reader->default_graph.buf) { cannam@226: graph = &reader->default_graph; cannam@226: } cannam@226: bool ret = !reader->statement_sink || cannam@226: !reader->statement_sink( cannam@226: reader->handle, *ctx.flags, graph, cannam@226: deref(reader, ctx.subject), deref(reader, ctx.predicate), cannam@226: deref(reader, o), deref(reader, d), deref(reader, l)); cannam@226: *ctx.flags &= SERD_ANON_CONT|SERD_LIST_CONT; // Preserve only cont flags cannam@226: return ret; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_collection(SerdReader* reader, ReadContext ctx, Ref* dest); cannam@226: cannam@226: static bool cannam@226: read_predicateObjectList(SerdReader* reader, ReadContext ctx, 
bool* ate_dot); cannam@226: cannam@226: static inline uint8_t cannam@226: read_HEX(SerdReader* reader) cannam@226: { cannam@226: const uint8_t c = peek_byte(reader); cannam@226: if (is_digit(c) || in_range(c, 'A', 'F') || in_range(c, 'a', 'f')) { cannam@226: return eat_byte_safe(reader, c); cannam@226: } else { cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "invalid hexadecimal digit `%c'\n", c); cannam@226: } cannam@226: } cannam@226: cannam@226: // Read UCHAR escape, initial \ is already eaten by caller cannam@226: static inline bool cannam@226: read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code) cannam@226: { cannam@226: const uint8_t b = peek_byte(reader); cannam@226: unsigned length = 0; cannam@226: switch (b) { cannam@226: case 'U': cannam@226: length = 8; cannam@226: break; cannam@226: case 'u': cannam@226: length = 4; cannam@226: break; cannam@226: default: cannam@226: return false; cannam@226: } cannam@226: eat_byte_safe(reader, b); cannam@226: cannam@226: uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; cannam@226: for (unsigned i = 0; i < length; ++i) { cannam@226: if (!(buf[i] = read_HEX(reader))) { cannam@226: return false; cannam@226: } cannam@226: } cannam@226: cannam@226: uint32_t code; cannam@226: sscanf((const char*)buf, "%X", &code); cannam@226: cannam@226: unsigned size = 0; cannam@226: if (code < 0x00000080) { cannam@226: size = 1; cannam@226: } else if (code < 0x00000800) { cannam@226: size = 2; cannam@226: } else if (code < 0x00010000) { cannam@226: size = 3; cannam@226: } else if (code < 0x00110000) { cannam@226: size = 4; cannam@226: } else { cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "unicode character 0x%X out of range\n", code); cannam@226: push_replacement(reader, dest); cannam@226: *char_code = 0xFFFD; cannam@226: return true; cannam@226: } cannam@226: cannam@226: // Build output in buf cannam@226: // (Note # of bytes = # of leading 1 bits in first byte) cannam@226: uint32_t c = code; 
cannam@226: switch (size) { cannam@226: case 4: cannam@226: buf[3] = 0x80 | (uint8_t)(c & 0x3F); cannam@226: c >>= 6; cannam@226: c |= (16 << 12); // set bit 4 cannam@226: case 3: cannam@226: buf[2] = 0x80 | (uint8_t)(c & 0x3F); cannam@226: c >>= 6; cannam@226: c |= (32 << 6); // set bit 5 cannam@226: case 2: cannam@226: buf[1] = 0x80 | (uint8_t)(c & 0x3F); cannam@226: c >>= 6; cannam@226: c |= 0xC0; // set bits 6 and 7 cannam@226: case 1: cannam@226: buf[0] = (uint8_t)c; cannam@226: } cannam@226: cannam@226: for (unsigned i = 0; i < size; ++i) { cannam@226: push_byte(reader, dest, buf[i]); cannam@226: } cannam@226: *char_code = code; cannam@226: return true; cannam@226: } cannam@226: cannam@226: // Read ECHAR escape, initial \ is already eaten by caller cannam@226: static inline bool cannam@226: read_ECHAR(SerdReader* reader, Ref dest, SerdNodeFlags* flags) cannam@226: { cannam@226: const uint8_t c = peek_byte(reader); cannam@226: switch (c) { cannam@226: case 't': cannam@226: eat_byte_safe(reader, 't'); cannam@226: push_byte(reader, dest, '\t'); cannam@226: return true; cannam@226: case 'b': cannam@226: eat_byte_safe(reader, 'b'); cannam@226: push_byte(reader, dest, '\b'); cannam@226: return true; cannam@226: case 'n': cannam@226: *flags |= SERD_HAS_NEWLINE; cannam@226: eat_byte_safe(reader, 'n'); cannam@226: push_byte(reader, dest, '\n'); cannam@226: return true; cannam@226: case 'r': cannam@226: *flags |= SERD_HAS_NEWLINE; cannam@226: eat_byte_safe(reader, 'r'); cannam@226: push_byte(reader, dest, '\r'); cannam@226: return true; cannam@226: case 'f': cannam@226: eat_byte_safe(reader, 'f'); cannam@226: push_byte(reader, dest, '\f'); cannam@226: return true; cannam@226: case '\\': case '"': case '\'': cannam@226: push_byte(reader, dest, eat_byte_safe(reader, c)); cannam@226: return true; cannam@226: default: cannam@226: return false; cannam@226: } cannam@226: } cannam@226: cannam@226: static inline SerdStatus cannam@226: bad_char(SerdReader* reader, Ref dest, 
const char* fmt, uint8_t c) cannam@226: { cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c); cannam@226: push_replacement(reader, dest); cannam@226: cannam@226: // Skip bytes until the next start byte cannam@226: for (uint8_t b = peek_byte(reader); (b & 0x80);) { cannam@226: eat_byte_safe(reader, b); cannam@226: b = peek_byte(reader); cannam@226: } cannam@226: cannam@226: return SERD_SUCCESS; cannam@226: } cannam@226: cannam@226: static SerdStatus cannam@226: read_utf8_character(SerdReader* reader, Ref dest, uint8_t c) cannam@226: { cannam@226: unsigned size = 1; cannam@226: if ((c & 0xE0) == 0xC0) { // Starts with `110' cannam@226: size = 2; cannam@226: } else if ((c & 0xF0) == 0xE0) { // Starts with `1110' cannam@226: size = 3; cannam@226: } else if ((c & 0xF8) == 0xF0) { // Starts with `11110' cannam@226: size = 4; cannam@226: } else { cannam@226: return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n", c); cannam@226: } cannam@226: cannam@226: char bytes[4]; cannam@226: bytes[0] = c; cannam@226: cannam@226: // Check character validity cannam@226: for (unsigned i = 1; i < size; ++i) { cannam@226: if (((bytes[i] = peek_byte(reader)) & 0x80) == 0) { cannam@226: return bad_char(reader, dest, "invalid UTF-8 continuation 0x%X\n", cannam@226: bytes[i]); cannam@226: } cannam@226: eat_byte_safe(reader, bytes[i]); cannam@226: } cannam@226: cannam@226: // Emit character cannam@226: for (unsigned i = 0; i < size; ++i) { cannam@226: push_byte(reader, dest, bytes[i]); cannam@226: } cannam@226: return SERD_SUCCESS; cannam@226: } cannam@226: cannam@226: // Read one character (possibly multi-byte) cannam@226: // The first byte, c, has already been eaten by caller cannam@226: static inline SerdStatus cannam@226: read_character(SerdReader* reader, Ref dest, SerdNodeFlags* flags, uint8_t c) cannam@226: { cannam@226: if (!(c & 0x80)) { cannam@226: switch (c) { cannam@226: case 0xA: case 0xD: cannam@226: *flags |= SERD_HAS_NEWLINE; cannam@226: break; cannam@226: case '"': 
case '\'': cannam@226: *flags |= SERD_HAS_QUOTE; cannam@226: break; cannam@226: } cannam@226: push_byte(reader, dest, c); cannam@226: return SERD_SUCCESS; cannam@226: } else { cannam@226: return read_utf8_character(reader, dest, c); cannam@226: } cannam@226: } cannam@226: cannam@226: // [10] comment ::= '#' ( [^#xA #xD] )* cannam@226: static void cannam@226: read_comment(SerdReader* reader) cannam@226: { cannam@226: eat_byte_safe(reader, '#'); cannam@226: uint8_t c; cannam@226: while (((c = peek_byte(reader)) != 0xA) && (c != 0xD) && c) { cannam@226: eat_byte_safe(reader, c); cannam@226: } cannam@226: } cannam@226: cannam@226: // [24] ws ::= #x9 | #xA | #xD | #x20 | comment cannam@226: static inline bool cannam@226: read_ws(SerdReader* reader) cannam@226: { cannam@226: const uint8_t c = peek_byte(reader); cannam@226: switch (c) { cannam@226: case 0x9: case 0xA: case 0xD: case 0x20: cannam@226: eat_byte_safe(reader, c); cannam@226: return true; cannam@226: case '#': cannam@226: read_comment(reader); cannam@226: return true; cannam@226: default: cannam@226: return false; cannam@226: } cannam@226: } cannam@226: cannam@226: static inline bool cannam@226: read_ws_star(SerdReader* reader) cannam@226: { cannam@226: while (read_ws(reader)) {} cannam@226: return true; cannam@226: } cannam@226: cannam@226: static inline bool cannam@226: peek_delim(SerdReader* reader, const char delim) cannam@226: { cannam@226: read_ws_star(reader); cannam@226: return peek_byte(reader) == delim; cannam@226: } cannam@226: cannam@226: static inline bool cannam@226: eat_delim(SerdReader* reader, const char delim) cannam@226: { cannam@226: if (peek_delim(reader, delim)) { cannam@226: eat_byte_safe(reader, delim); cannam@226: return read_ws_star(reader); cannam@226: } cannam@226: return false; cannam@226: } cannam@226: cannam@226: // STRING_LITERAL_LONG_QUOTE and STRING_LITERAL_LONG_SINGLE_QUOTE cannam@226: // Initial triple quotes are already eaten by caller cannam@226: static Ref cannam@226: 
read_STRING_LITERAL_LONG(SerdReader* reader, SerdNodeFlags* flags, uint8_t q) cannam@226: { cannam@226: Ref ref = push_node(reader, SERD_LITERAL, "", 0); cannam@226: while (true) { cannam@226: const uint8_t c = peek_byte(reader); cannam@226: uint32_t code; cannam@226: switch (c) { cannam@226: case '\\': cannam@226: eat_byte_safe(reader, c); cannam@226: if (!read_ECHAR(reader, ref, flags) && cannam@226: !read_UCHAR(reader, ref, &code)) { cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "invalid escape `\\%c'\n", peek_byte(reader)); cannam@226: return pop_node(reader, ref); cannam@226: } cannam@226: break; cannam@226: default: cannam@226: if (c == q) { cannam@226: eat_byte_safe(reader, q); cannam@226: const uint8_t q2 = eat_byte_safe(reader, peek_byte(reader)); cannam@226: const uint8_t q3 = peek_byte(reader); cannam@226: if (q2 == q && q3 == q) { // End of string cannam@226: eat_byte_safe(reader, q3); cannam@226: return ref; cannam@226: } else { cannam@226: *flags |= SERD_HAS_QUOTE; cannam@226: push_byte(reader, ref, c); cannam@226: read_character(reader, ref, flags, q2); cannam@226: } cannam@226: } else { cannam@226: read_character(reader, ref, flags, eat_byte_safe(reader, c)); cannam@226: } cannam@226: } cannam@226: } cannam@226: return ref; cannam@226: } cannam@226: cannam@226: // STRING_LITERAL_QUOTE and STRING_LITERAL_SINGLE_QUOTE cannam@226: // Initial quote is already eaten by caller cannam@226: static Ref cannam@226: read_STRING_LITERAL(SerdReader* reader, SerdNodeFlags* flags, uint8_t q) cannam@226: { cannam@226: Ref ref = push_node(reader, SERD_LITERAL, "", 0); cannam@226: while (true) { cannam@226: const uint8_t c = peek_byte(reader); cannam@226: uint32_t code; cannam@226: switch (c) { cannam@226: case '\n': case '\r': cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, "line end in short string\n"); cannam@226: return pop_node(reader, ref); cannam@226: case '\\': cannam@226: eat_byte_safe(reader, c); cannam@226: if (!read_ECHAR(reader, ref, flags) 
&& cannam@226: !read_UCHAR(reader, ref, &code)) { cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "invalid escape `\\%c'\n", peek_byte(reader)); cannam@226: return pop_node(reader, ref); cannam@226: } cannam@226: break; cannam@226: default: cannam@226: if (c == q) { cannam@226: eat_byte_check(reader, q); cannam@226: return ref; cannam@226: } else { cannam@226: read_character(reader, ref, flags, eat_byte_safe(reader, c)); cannam@226: } cannam@226: } cannam@226: } cannam@226: eat_byte_check(reader, q); cannam@226: return ref; cannam@226: } cannam@226: cannam@226: static Ref cannam@226: read_String(SerdReader* reader, SerdNodeFlags* flags) cannam@226: { cannam@226: const uint8_t q1 = peek_byte(reader); cannam@226: eat_byte_safe(reader, q1); cannam@226: cannam@226: const uint8_t q2 = peek_byte(reader); cannam@226: if (q2 != q1) { // Short string (not triple quoted) cannam@226: return read_STRING_LITERAL(reader, flags, q1); cannam@226: } cannam@226: cannam@226: eat_byte_safe(reader, q2); cannam@226: const uint8_t q3 = peek_byte(reader); cannam@226: if (q3 != q1) { // Empty short string ("" or '') cannam@226: return push_node(reader, SERD_LITERAL, "", 0); cannam@226: } cannam@226: cannam@226: if (!supports_fancy_literals(reader)) { cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "syntax does not support long literals\n"); cannam@226: } cannam@226: cannam@226: eat_byte_safe(reader, q3); cannam@226: return read_STRING_LITERAL_LONG(reader, flags, q1); cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_PN_CHARS_BASE(SerdReader* reader, Ref dest) cannam@226: { cannam@226: const uint8_t c = peek_byte(reader); cannam@226: if ((c & 0x80)) { // Multi-byte character cannam@226: return !read_utf8_character(reader, dest, eat_byte_safe(reader, c)); cannam@226: } cannam@226: if (is_alpha(c)) { cannam@226: push_byte(reader, dest, eat_byte_safe(reader, c)); cannam@226: return true; cannam@226: } cannam@226: return false; cannam@226: } 
cannam@226: cannam@226: static bool cannam@226: read_PN_CHARS(SerdReader* reader, Ref dest) cannam@226: { cannam@226: const uint8_t c = peek_byte(reader); cannam@226: if ((c & 0x80)) { // Multi-byte character cannam@226: return !read_utf8_character(reader, dest, eat_byte_safe(reader, c)); cannam@226: } cannam@226: cannam@226: if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') { cannam@226: push_byte(reader, dest, eat_byte_safe(reader, c)); cannam@226: return true; cannam@226: } cannam@226: return false; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_PERCENT(SerdReader* reader, Ref dest) cannam@226: { cannam@226: push_byte(reader, dest, eat_byte_safe(reader, '%')); cannam@226: const uint8_t h1 = read_HEX(reader); cannam@226: const uint8_t h2 = read_HEX(reader); cannam@226: if (h1 && h2) { cannam@226: push_byte(reader, dest, h1); cannam@226: push_byte(reader, dest, h2); cannam@226: return true; cannam@226: } cannam@226: return false; cannam@226: } cannam@226: cannam@226: static SerdStatus cannam@226: read_PLX(SerdReader* reader, Ref dest) cannam@226: { cannam@226: uint8_t c = peek_byte(reader); cannam@226: switch (c) { cannam@226: case '%': cannam@226: if (!read_PERCENT(reader, dest)) { cannam@226: return SERD_ERR_BAD_SYNTAX; cannam@226: } cannam@226: return SERD_SUCCESS; cannam@226: case '\\': cannam@226: eat_byte_safe(reader, c); cannam@226: if (is_alpha(c = peek_byte(reader))) { cannam@226: // Escapes like \u \n etc. 
are not supported cannam@226: return SERD_ERR_BAD_SYNTAX; cannam@226: } else { cannam@226: // Allow escaping of pretty much any other character cannam@226: push_byte(reader, dest, eat_byte_safe(reader, c)); cannam@226: return SERD_SUCCESS; cannam@226: } cannam@226: default: cannam@226: return SERD_FAILURE; cannam@226: } cannam@226: } cannam@226: cannam@226: static SerdStatus cannam@226: read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot) cannam@226: { cannam@226: uint8_t c = peek_byte(reader); cannam@226: SerdStatus st; cannam@226: switch (c) { cannam@226: case '0': case '1': case '2': case '3': case '4': case '5': cannam@226: case '6': case '7': case '8': case '9': case ':': case '_': cannam@226: push_byte(reader, dest, eat_byte_safe(reader, c)); cannam@226: break; cannam@226: default: cannam@226: if ((st = read_PLX(reader, dest)) > SERD_FAILURE) { cannam@226: return st; cannam@226: } else if (st != SERD_SUCCESS && !read_PN_CHARS_BASE(reader, dest)) { cannam@226: return SERD_FAILURE; cannam@226: } cannam@226: } cannam@226: cannam@226: while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.' | ';')* cannam@226: if (c == '.' 
|| c == ':') { cannam@226: push_byte(reader, dest, eat_byte_safe(reader, c)); cannam@226: } else if ((st = read_PLX(reader, dest)) > SERD_FAILURE) { cannam@226: return st; cannam@226: } else if (st != SERD_SUCCESS && !read_PN_CHARS(reader, dest)) { cannam@226: break; cannam@226: } cannam@226: } cannam@226: cannam@226: SerdNode* const n = deref(reader, dest); cannam@226: if (n->buf[n->n_bytes - 1] == '.') { cannam@226: // Ate trailing dot, pop it from stack/node and inform caller cannam@226: --n->n_bytes; cannam@226: serd_stack_pop(&reader->stack, 1); cannam@226: *ate_dot = true; cannam@226: } cannam@226: cannam@226: return SERD_SUCCESS; cannam@226: } cannam@226: cannam@226: // Read the remainder of a PN_PREFIX after some initial characters cannam@226: static SerdStatus cannam@226: read_PN_PREFIX_tail(SerdReader* reader, Ref dest) cannam@226: { cannam@226: uint8_t c; cannam@226: while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')* cannam@226: if (c == '.') { cannam@226: push_byte(reader, dest, eat_byte_safe(reader, c)); cannam@226: } else if (!read_PN_CHARS(reader, dest)) { cannam@226: break; cannam@226: } cannam@226: } cannam@226: cannam@226: const SerdNode* const n = deref(reader, dest); cannam@226: if (n->buf[n->n_bytes - 1] == '.' 
&& !read_PN_CHARS(reader, dest)) { cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, "prefix ends with `.'\n"); cannam@226: return SERD_ERR_BAD_SYNTAX; cannam@226: } cannam@226: cannam@226: return SERD_SUCCESS; cannam@226: } cannam@226: cannam@226: static SerdStatus cannam@226: read_PN_PREFIX(SerdReader* reader, Ref dest) cannam@226: { cannam@226: if (read_PN_CHARS_BASE(reader, dest)) { cannam@226: return read_PN_PREFIX_tail(reader, dest); cannam@226: } cannam@226: return SERD_FAILURE; cannam@226: } cannam@226: cannam@226: static Ref cannam@226: read_LANGTAG(SerdReader* reader) cannam@226: { cannam@226: uint8_t c = peek_byte(reader); cannam@226: if (!is_alpha(c)) { cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'\n", c); cannam@226: } cannam@226: Ref ref = push_node(reader, SERD_LITERAL, "", 0); cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: while ((c = peek_byte(reader)) && is_alpha(c)) { cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: } cannam@226: while (peek_byte(reader) == '-') { cannam@226: push_byte(reader, ref, eat_byte_safe(reader, '-')); cannam@226: while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) { cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: } cannam@226: } cannam@226: return ref; cannam@226: } cannam@226: cannam@226: typedef enum { PREFIX, GOOD, BAD} SchemeState; cannam@226: cannam@226: static inline bool cannam@226: check_scheme(SerdReader* reader, uint8_t c, SchemeState* state) cannam@226: { cannam@226: if (!supports_relative_iris(reader) && *state == PREFIX) { cannam@226: if (c == ':') { cannam@226: *state = GOOD; cannam@226: } else if (!isalpha(c)) { cannam@226: *state = BAD; cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "syntax does not support relative IRIs\n"); cannam@226: } cannam@226: } cannam@226: return true; cannam@226: } cannam@226: cannam@226: static Ref cannam@226: read_IRIREF(SerdReader* reader) 
cannam@226: { cannam@226: TRY_RET(eat_byte_check(reader, '<')); cannam@226: Ref ref = push_node(reader, SERD_URI, "", 0); cannam@226: SchemeState scheme = PREFIX; cannam@226: uint32_t code; cannam@226: while (true) { cannam@226: const uint8_t c = peek_byte(reader); cannam@226: if (!check_scheme(reader, c, &scheme)) { cannam@226: return pop_node(reader, ref); cannam@226: } cannam@226: switch (c) { cannam@226: case '"': case '<': case '^': case '`': case '{': case '|': case '}': cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "invalid IRI character `%c'\n", c); cannam@226: return pop_node(reader, ref); cannam@226: case '>': cannam@226: eat_byte_safe(reader, c); cannam@226: return ref; cannam@226: case '\\': cannam@226: eat_byte_safe(reader, c); cannam@226: if (!read_UCHAR(reader, ref, &code)) { cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape\n"); cannam@226: return pop_node(reader, ref); cannam@226: } cannam@226: switch (code) { cannam@226: case 0: case ' ': case '<': case '>': cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "invalid escaped IRI character %X %c\n", code, code); cannam@226: return pop_node(reader, ref); cannam@226: } cannam@226: break; cannam@226: default: cannam@226: if (c <= 0x20) { cannam@226: if (isprint(c)) { cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "invalid IRI character `%c' (escape %%%02X)\n", c, c); cannam@226: } else { cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "invalid IRI character (escape %%%02X)\n", c, c); cannam@226: } cannam@226: if (reader->strict) { cannam@226: return pop_node(reader, ref); cannam@226: } cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: } else { cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: } cannam@226: } cannam@226: } cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_PrefixedName(SerdReader* reader, Ref dest, bool read_prefix, bool* ate_dot) cannam@226: { cannam@226: if 
(read_prefix && read_PN_PREFIX(reader, dest) > SERD_FAILURE) { cannam@226: return false; cannam@226: } else if (peek_byte(reader) != ':') { cannam@226: return false; cannam@226: } cannam@226: cannam@226: push_byte(reader, dest, eat_byte_safe(reader, ':')); cannam@226: return read_PN_LOCAL(reader, dest, ate_dot) <= SERD_FAILURE; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_0_9(SerdReader* reader, Ref str, bool at_least_one) cannam@226: { cannam@226: unsigned count = 0; cannam@226: for (uint8_t c; is_digit((c = peek_byte(reader))); ++count) { cannam@226: push_byte(reader, str, eat_byte_safe(reader, c)); cannam@226: } cannam@226: if (at_least_one && count == 0) { cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, "expected digit\n"); cannam@226: } cannam@226: return count; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_number(SerdReader* reader, Ref* dest, Ref* datatype, bool* ate_dot) cannam@226: { cannam@226: #define XSD_DECIMAL NS_XSD "decimal" cannam@226: #define XSD_DOUBLE NS_XSD "double" cannam@226: #define XSD_INTEGER NS_XSD "integer" cannam@226: Ref ref = push_node(reader, SERD_LITERAL, "", 0); cannam@226: uint8_t c = peek_byte(reader); cannam@226: bool has_decimal = false; cannam@226: if (c == '-' || c == '+') { cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: } cannam@226: if ((c = peek_byte(reader)) == '.') { cannam@226: has_decimal = true; cannam@226: // decimal case 2 (e.g. '.0' or `-.0' or `+.0') cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: TRY_THROW(read_0_9(reader, ref, true)); cannam@226: } else { cannam@226: // all other cases ::= ( '-' | '+' ) [0-9]+ ( . )? ( [0-9]+ )? ... 
cannam@226: TRY_THROW(is_digit(c)); cannam@226: read_0_9(reader, ref, true); cannam@226: if ((c = peek_byte(reader)) == '.') { cannam@226: has_decimal = true; cannam@226: cannam@226: // Annoyingly, dot can be end of statement, so tentatively eat cannam@226: eat_byte_safe(reader, c); cannam@226: c = peek_byte(reader); cannam@226: if (!is_digit(c) && c != 'e' && c != 'E') { cannam@226: *dest = ref; cannam@226: *ate_dot = true; // Force caller to deal with stupid grammar cannam@226: return true; // Next byte is not a number character, done cannam@226: } cannam@226: cannam@226: push_byte(reader, ref, '.'); cannam@226: read_0_9(reader, ref, false); cannam@226: } cannam@226: } cannam@226: c = peek_byte(reader); cannam@226: if (c == 'e' || c == 'E') { cannam@226: // double cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: switch ((c = peek_byte(reader))) { cannam@226: case '+': case '-': cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: default: break; cannam@226: } cannam@226: TRY_THROW(read_0_9(reader, ref, true)); cannam@226: *datatype = push_node(reader, SERD_URI, cannam@226: XSD_DOUBLE, sizeof(XSD_DOUBLE) - 1); cannam@226: } else if (has_decimal) { cannam@226: *datatype = push_node(reader, SERD_URI, cannam@226: XSD_DECIMAL, sizeof(XSD_DECIMAL) - 1); cannam@226: } else { cannam@226: *datatype = push_node(reader, SERD_URI, cannam@226: XSD_INTEGER, sizeof(XSD_INTEGER) - 1); cannam@226: } cannam@226: *dest = ref; cannam@226: return true; cannam@226: except: cannam@226: pop_node(reader, *datatype); cannam@226: pop_node(reader, ref); cannam@226: return false; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_iri(SerdReader* reader, Ref* dest, bool* ate_dot) cannam@226: { cannam@226: switch (peek_byte(reader)) { cannam@226: case '<': cannam@226: *dest = read_IRIREF(reader); cannam@226: return true; cannam@226: default: cannam@226: *dest = push_node(reader, SERD_CURIE, "", 0); cannam@226: return 
read_PrefixedName(reader, *dest, true, ate_dot); cannam@226: } cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_literal(SerdReader* reader, Ref* dest, cannam@226: Ref* datatype, Ref* lang, SerdNodeFlags* flags, bool* ate_dot) cannam@226: { cannam@226: Ref str = read_String(reader, flags); cannam@226: if (!str) { cannam@226: return false; cannam@226: } cannam@226: cannam@226: switch (peek_byte(reader)) { cannam@226: case '@': cannam@226: eat_byte_safe(reader, '@'); cannam@226: TRY_THROW(*lang = read_LANGTAG(reader)); cannam@226: break; cannam@226: case '^': cannam@226: eat_byte_safe(reader, '^'); cannam@226: eat_byte_check(reader, '^'); cannam@226: TRY_THROW(read_iri(reader, datatype, ate_dot)); cannam@226: break; cannam@226: } cannam@226: *dest = str; cannam@226: return true; cannam@226: except: cannam@226: *datatype = pop_node(reader, *datatype); cannam@226: *lang = pop_node(reader, *lang); cannam@226: pop_node(reader, str); cannam@226: return false; cannam@226: } cannam@226: cannam@226: inline static bool cannam@226: is_token_end(uint8_t c) cannam@226: { cannam@226: switch (c) { cannam@226: case 0x9: case 0xA: case 0xD: case 0x20: case '\0': cannam@226: case '#': case '.': case ';': case '<': cannam@226: return true; cannam@226: default: cannam@226: return false; cannam@226: } cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_verb(SerdReader* reader, Ref* dest) cannam@226: { cannam@226: if (peek_byte(reader) == '<') { cannam@226: return (*dest = read_IRIREF(reader)); cannam@226: } else { cannam@226: /* Either a qname, or "a". Read the prefix first, and if it is in fact cannam@226: "a", produce that instead. 
cannam@226: */ cannam@226: *dest = push_node(reader, SERD_CURIE, "", 0); cannam@226: SerdNode* node = deref(reader, *dest); cannam@226: const SerdStatus st = read_PN_PREFIX(reader, *dest); cannam@226: bool ate_dot = false; cannam@226: if (!st && node->n_bytes == 1 && node->buf[0] == 'a' && cannam@226: is_token_end(peek_byte(reader))) { cannam@226: pop_node(reader, *dest); cannam@226: return (*dest = push_node(reader, SERD_URI, NS_RDF "type", 47)); cannam@226: } else if (st > SERD_FAILURE || cannam@226: !read_PrefixedName(reader, *dest, false, &ate_dot) || cannam@226: ate_dot) { cannam@226: return (*dest = pop_node(reader, *dest)); cannam@226: } else { cannam@226: return true; cannam@226: } cannam@226: } cannam@226: return false; cannam@226: } cannam@226: cannam@226: static Ref cannam@226: read_BLANK_NODE_LABEL(SerdReader* reader, bool* ate_dot) cannam@226: { cannam@226: eat_byte_safe(reader, '_'); cannam@226: eat_byte_check(reader, ':'); cannam@226: Ref ref = push_node(reader, SERD_BLANK, cannam@226: reader->bprefix ? (char*)reader->bprefix : "", cannam@226: reader->bprefix_len); cannam@226: cannam@226: uint8_t c = peek_byte(reader); // First: (PN_CHARS | '_' | [0-9]) cannam@226: if (is_digit(c) || c == '_') { cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: } else if (!read_PN_CHARS(reader, ref)) { cannam@226: r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid name start character\n"); cannam@226: return pop_node(reader, ref); cannam@226: } cannam@226: cannam@226: while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')* cannam@226: if (c == '.') { cannam@226: push_byte(reader, ref, eat_byte_safe(reader, c)); cannam@226: } else if (!read_PN_CHARS(reader, ref)) { cannam@226: break; cannam@226: } cannam@226: } cannam@226: cannam@226: SerdNode* n = deref(reader, ref); cannam@226: if (n->buf[n->n_bytes - 1] == '.' 
&& !read_PN_CHARS(reader, ref)) { cannam@226: // Ate trailing dot, pop it from stack/node and inform caller cannam@226: --n->n_bytes; cannam@226: serd_stack_pop(&reader->stack, 1); cannam@226: *ate_dot = true; cannam@226: } cannam@226: cannam@226: if (reader->syntax == SERD_TURTLE) { cannam@226: if (is_digit(n->buf[reader->bprefix_len + 1])) { cannam@226: if ((n->buf[reader->bprefix_len]) == 'b') { cannam@226: ((char*)n->buf)[reader->bprefix_len] = 'B'; // Prevent clash cannam@226: reader->seen_genid = true; cannam@226: } else if (reader->seen_genid && cannam@226: n->buf[reader->bprefix_len] == 'B') { cannam@226: r_err(reader, SERD_ERR_ID_CLASH, cannam@226: "found both `b' and `B' blank IDs, prefix required\n"); cannam@226: return pop_node(reader, ref); cannam@226: } cannam@226: } cannam@226: } cannam@226: return ref; cannam@226: } cannam@226: cannam@226: static void cannam@226: set_blank_id(SerdReader* reader, Ref ref, size_t buf_size) cannam@226: { cannam@226: SerdNode* node = deref(reader, ref); cannam@226: const char* prefix = reader->bprefix ? 
(const char*)reader->bprefix : ""; cannam@226: node->n_bytes = node->n_chars = snprintf( cannam@226: (char*)node->buf, buf_size, "%sb%u", prefix, reader->next_id++); cannam@226: } cannam@226: cannam@226: static size_t cannam@226: genid_size(SerdReader* reader) cannam@226: { cannam@226: return reader->bprefix_len + 1 + 10 + 1; // + "b" + UINT32_MAX + \0 cannam@226: } cannam@226: cannam@226: static Ref cannam@226: blank_id(SerdReader* reader) cannam@226: { cannam@226: Ref ref = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); cannam@226: set_blank_id(reader, ref, genid_size(reader)); cannam@226: return ref; cannam@226: } cannam@226: cannam@226: static Ref cannam@226: read_blankName(SerdReader* reader) cannam@226: { cannam@226: eat_byte_safe(reader, '='); cannam@226: if (eat_byte_check(reader, '=') != '=') { cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected `='\n"); cannam@226: } cannam@226: cannam@226: Ref subject = 0; cannam@226: bool ate_dot = false; cannam@226: read_ws_star(reader); cannam@226: read_iri(reader, &subject, &ate_dot); cannam@226: return subject; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_anon(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest) cannam@226: { cannam@226: const SerdStatementFlags old_flags = *ctx.flags; cannam@226: bool empty; cannam@226: eat_byte_safe(reader, '['); cannam@226: if ((empty = peek_delim(reader, ']'))) { cannam@226: *ctx.flags |= (subject) ? SERD_EMPTY_S : SERD_EMPTY_O; cannam@226: } else { cannam@226: *ctx.flags |= (subject) ? 
SERD_ANON_S_BEGIN : SERD_ANON_O_BEGIN; cannam@226: if (peek_delim(reader, '=')) { cannam@226: if (!(*dest = read_blankName(reader)) || cannam@226: !eat_delim(reader, ';')) { cannam@226: return false; cannam@226: } cannam@226: } cannam@226: } cannam@226: cannam@226: if (!*dest) { cannam@226: *dest = blank_id(reader); cannam@226: } cannam@226: if (ctx.subject) { cannam@226: TRY_RET(emit_statement(reader, ctx, *dest, 0, 0)); cannam@226: } cannam@226: cannam@226: ctx.subject = *dest; cannam@226: if (!empty) { cannam@226: *ctx.flags &= ~(SERD_LIST_CONT); cannam@226: if (!subject) { cannam@226: *ctx.flags |= SERD_ANON_CONT; cannam@226: } cannam@226: bool ate_dot_in_list = false; cannam@226: read_predicateObjectList(reader, ctx, &ate_dot_in_list); cannam@226: if (ate_dot_in_list) { cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, "`.' inside blank\n"); cannam@226: } cannam@226: read_ws_star(reader); cannam@226: if (reader->end_sink) { cannam@226: reader->end_sink(reader->handle, deref(reader, *dest)); cannam@226: } cannam@226: *ctx.flags = old_flags; cannam@226: } cannam@226: return (eat_byte_check(reader, ']') == ']'); cannam@226: } cannam@226: cannam@226: /* If emit is true: recurses, calling statement_sink for every statement cannam@226: encountered, and leaves stack in original calling state (i.e. pops cannam@226: everything it pushes). 
*/ cannam@226: static bool cannam@226: read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot) cannam@226: { cannam@226: static const char* const XSD_BOOLEAN = NS_XSD "boolean"; cannam@226: static const size_t XSD_BOOLEAN_LEN = 40; cannam@226: cannam@226: #ifndef NDEBUG cannam@226: const size_t orig_stack_size = reader->stack.size; cannam@226: #endif cannam@226: cannam@226: bool ret = false; cannam@226: bool simple = (ctx->subject != 0); cannam@226: SerdNode* node = NULL; cannam@226: Ref o = 0; cannam@226: Ref datatype = 0; cannam@226: Ref lang = 0; cannam@226: uint32_t flags = 0; cannam@226: const uint8_t c = peek_byte(reader); cannam@226: if (!supports_fancy_literals(reader)) { cannam@226: switch (c) { cannam@226: case '"': case ':': case '<': case '_': break; cannam@226: default: return r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "expected: ':', '<', or '_'\n"); cannam@226: } cannam@226: } cannam@226: switch (c) { cannam@226: case '\0': cannam@226: case ')': cannam@226: return false; cannam@226: case '[': cannam@226: simple = false; cannam@226: TRY_THROW(ret = read_anon(reader, *ctx, false, &o)); cannam@226: break; cannam@226: case '(': cannam@226: simple = false; cannam@226: TRY_THROW(ret = read_collection(reader, *ctx, &o)); cannam@226: break; cannam@226: case '_': cannam@226: TRY_THROW(ret = (o = read_BLANK_NODE_LABEL(reader, ate_dot))); cannam@226: break; cannam@226: case '<': case ':': cannam@226: TRY_THROW(ret = read_iri(reader, &o, ate_dot)); cannam@226: break; cannam@226: case '+': case '-': case '.': case '0': case '1': case '2': case '3': cannam@226: case '4': case '5': case '6': case '7': case '8': case '9': cannam@226: TRY_THROW(ret = read_number(reader, &o, &datatype, ate_dot)); cannam@226: break; cannam@226: case '\"': cannam@226: case '\'': cannam@226: TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang, &flags, ate_dot)); cannam@226: break; cannam@226: default: cannam@226: /* Either a boolean literal, or a qname. 
Read the prefix first, and if cannam@226: it is in fact a "true" or "false" literal, produce that instead. cannam@226: */ cannam@226: node = deref(reader, o = push_node(reader, SERD_CURIE, "", 0)); cannam@226: while (read_PN_CHARS_BASE(reader, o)) {} cannam@226: if ((node->n_bytes == 4 && !memcmp(node->buf, "true", 4)) || cannam@226: (node->n_bytes == 5 && !memcmp(node->buf, "false", 5))) { cannam@226: node->type = SERD_LITERAL; cannam@226: datatype = push_node( cannam@226: reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN); cannam@226: ret = true; cannam@226: } else if (read_PN_PREFIX_tail(reader, o) > SERD_FAILURE) { cannam@226: ret = false; cannam@226: } else { cannam@226: ret = read_PrefixedName(reader, o, false, ate_dot); cannam@226: } cannam@226: } cannam@226: cannam@226: if (simple && o) { cannam@226: deref(reader, o)->flags = flags; cannam@226: } cannam@226: cannam@226: if (ret && emit && simple) { cannam@226: ret = emit_statement(reader, *ctx, o, datatype, lang); cannam@226: } else if (ret && !emit) { cannam@226: ctx->object = o; cannam@226: ctx->datatype = datatype; cannam@226: ctx->lang = lang; cannam@226: return true; cannam@226: } cannam@226: cannam@226: except: cannam@226: pop_node(reader, lang); cannam@226: pop_node(reader, datatype); cannam@226: pop_node(reader, o); cannam@226: #ifndef NDEBUG cannam@226: assert(reader->stack.size == orig_stack_size); cannam@226: #endif cannam@226: return ret; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_objectList(SerdReader* reader, ReadContext ctx, bool* ate_dot) cannam@226: { cannam@226: TRY_RET(read_object(reader, &ctx, true, ate_dot)); cannam@226: while (!*ate_dot && eat_delim(reader, ',')) { cannam@226: TRY_RET(read_object(reader, &ctx, true, ate_dot)); cannam@226: } cannam@226: return true; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot) cannam@226: { cannam@226: uint8_t c; cannam@226: while (true) { 
/**
   Read a collection "( ... )" and emit it as an rdf:first/rdf:rest list.

   @param reader Reader to consume input from.
   @param ctx    Statement context (by value).  If ctx.subject is set, a
                 statement linking it to the list head is emitted first.
   @param dest   Set to the list head: rdf:nil for an empty collection,
                 otherwise a newly generated blank node.
   @return true on success, false otherwise.

   NOTE(review): the TRY_RET() early returns below bypass end_collection(),
   so n1/n2 are not popped here on those paths -- confirm callers clean up.
*/
static bool
read_collection(SerdReader* reader, ReadContext ctx, Ref* dest)
{
	eat_byte_safe(reader, '(');
	bool end = peek_delim(reader, ')');
	*dest = end ? reader->rdf_nil : blank_id(reader);
	if (ctx.subject) {
		// subject predicate _:head
		*ctx.flags |= (end ? 0 : SERD_LIST_O_BEGIN);
		TRY_RET(emit_statement(reader, ctx, *dest, 0, 0));
		*ctx.flags |= SERD_LIST_CONT;
	} else {
		// The list itself is the subject
		*ctx.flags |= (end ? 0 : SERD_LIST_S_BEGIN);
	}

	if (end) {
		// Empty collection: nothing was pushed for list cells
		return end_collection(reader, ctx, 0, 0, true);
	}

	/* The order of node allocation here is necessarily not in stack order,
	   so we create two nodes and recycle them throughout. */
	Ref n1   = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0);
	Ref n2   = 0;
	Ref node = n1;
	Ref rest = 0;

	ctx.subject = *dest;
	while (!(end = peek_delim(reader, ')'))) {
		// _:node rdf:first object
		ctx.predicate = reader->rdf_first;
		bool ate_dot = false;
		if (!read_object(reader, &ctx, true, &ate_dot) || ate_dot) {
			// Bad element, or an element swallowed a '.' inside the list
			return end_collection(reader, ctx, n1, n2, false);
		}

		if (!(end = peek_delim(reader, ')'))) {
			/* Give rest a new ID.  Done as late as possible to ensure it is
			   used and > IDs generated by read_object above. */
			if (!rest) {
				rest = n2 = blank_id(reader);  // First pass, push
			} else {
				// Later passes reuse the recycled node's buffer
				set_blank_id(reader, rest, genid_size(reader));
			}
		}

		// _:node rdf:rest _:rest
		*ctx.flags |= SERD_LIST_CONT;
		ctx.predicate = reader->rdf_rest;
		TRY_RET(emit_statement(reader, ctx,
		                       (end ? reader->rdf_nil : rest), 0, 0));

		ctx.subject = rest;         // _:node = _:rest
		rest        = node;         // _:rest = (old)_:node
		node        = ctx.subject;  // invariant
	}

	return end_collection(reader, ctx, n1, n2, true);
}
cannam@226: ret = read_predicateObjectList(reader, ctx, ate_dot); cannam@226: } cannam@226: ctx.subject = ctx.predicate = 0; cannam@226: return ret; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_base(SerdReader* reader, bool sparql, bool token) cannam@226: { cannam@226: if (token) { cannam@226: TRY_RET(eat_string(reader, "base", 4)); cannam@226: } cannam@226: cannam@226: Ref uri; cannam@226: read_ws_star(reader); cannam@226: TRY_RET(uri = read_IRIREF(reader)); cannam@226: if (reader->base_sink) { cannam@226: reader->base_sink(reader->handle, deref(reader, uri)); cannam@226: } cannam@226: pop_node(reader, uri); cannam@226: cannam@226: read_ws_star(reader); cannam@226: if (!sparql) { cannam@226: return eat_byte_check(reader, '.'); cannam@226: } else if (peek_byte(reader) == '.') { cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "full stop after SPARQL BASE\n"); cannam@226: } cannam@226: return true; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_prefixID(SerdReader* reader, bool sparql, bool token) cannam@226: { cannam@226: if (token) { cannam@226: TRY_RET(eat_string(reader, "prefix", 6)); cannam@226: } cannam@226: cannam@226: read_ws_star(reader); cannam@226: bool ret = true; cannam@226: Ref name = push_node(reader, SERD_LITERAL, "", 0); cannam@226: if (read_PN_PREFIX(reader, name) > SERD_FAILURE) { cannam@226: return pop_node(reader, name); cannam@226: } cannam@226: cannam@226: if (eat_byte_check(reader, ':') != ':') { cannam@226: return pop_node(reader, name); cannam@226: } cannam@226: cannam@226: read_ws_star(reader); cannam@226: const Ref uri = read_IRIREF(reader); cannam@226: if (!uri) { cannam@226: pop_node(reader, name); cannam@226: return false; cannam@226: } cannam@226: cannam@226: if (reader->prefix_sink) { cannam@226: ret = !reader->prefix_sink(reader->handle, cannam@226: deref(reader, name), cannam@226: deref(reader, uri)); cannam@226: } cannam@226: pop_node(reader, uri); cannam@226: 
pop_node(reader, name); cannam@226: if (!sparql) { cannam@226: read_ws_star(reader); cannam@226: return eat_byte_check(reader, '.'); cannam@226: } cannam@226: return ret; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_directive(SerdReader* reader) cannam@226: { cannam@226: const bool sparql = peek_byte(reader) != '@'; cannam@226: if (!sparql) { cannam@226: eat_byte_safe(reader, '@'); cannam@226: switch (peek_byte(reader)) { cannam@226: case 'B': case 'P': cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, cannam@226: "uppercase directive\n"); cannam@226: } cannam@226: } cannam@226: cannam@226: switch (peek_byte(reader)) { cannam@226: case 'B': case 'b': return read_base(reader, sparql, true); cannam@226: case 'P': case 'p': return read_prefixID(reader, sparql, true); cannam@226: default: cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid directive\n"); cannam@226: } cannam@226: cannam@226: return true; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_wrappedGraph(SerdReader* reader, ReadContext* ctx) cannam@226: { cannam@226: bool ate_dot = false; cannam@226: char s_type = 0; cannam@226: TRY_RET(eat_byte_check(reader, '{')); cannam@226: read_ws_star(reader); cannam@226: while (peek_byte(reader) != '}') { cannam@226: ctx->subject = 0; cannam@226: Ref subj = read_subject(reader, *ctx, &ctx->subject, &s_type); cannam@226: if (!subj || cannam@226: (!read_triples(reader, *ctx, &ate_dot) && s_type != '[')) { cannam@226: return false; cannam@226: } cannam@226: pop_node(reader, subj); cannam@226: read_ws_star(reader); cannam@226: if (peek_byte(reader) == '.') { cannam@226: eat_byte_safe(reader, '.'); cannam@226: } cannam@226: read_ws_star(reader); cannam@226: } cannam@226: return eat_byte_check(reader, '}'); cannam@226: } cannam@226: cannam@226: static int cannam@226: tokcmp(SerdReader* reader, Ref ref, const char* tok, size_t n) cannam@226: { cannam@226: SerdNode* node = deref(reader, ref); cannam@226: if (!node || 
node->n_bytes != n) { cannam@226: return -1; cannam@226: } cannam@226: const char* s1 = (const char*)node->buf; cannam@226: const char* s2 = tok; cannam@226: for (; n > 0 && *s2; s1++, s2++, --n) { cannam@226: if (toupper(*s1) != toupper(*s2)) { cannam@226: return ((*(uint8_t*)s1 < *(uint8_t*)s2) ? -1 : +1); cannam@226: } cannam@226: } cannam@226: return 0; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_statement(SerdReader* reader) cannam@226: { cannam@226: SerdStatementFlags flags = 0; cannam@226: ReadContext ctx = { 0, 0, 0, 0, 0, 0, &flags }; cannam@226: Ref subj = 0; cannam@226: bool ate_dot = false; cannam@226: char s_type = false; cannam@226: bool ret = true; cannam@226: read_ws_star(reader); cannam@226: switch (peek_byte(reader)) { cannam@226: case '\0': cannam@226: reader->eof = true; cannam@226: return reader->status <= SERD_FAILURE; cannam@226: case '@': cannam@226: TRY_RET(read_directive(reader)); cannam@226: read_ws_star(reader); cannam@226: break; cannam@226: case '{': cannam@226: if (reader->syntax == SERD_TRIG) { cannam@226: TRY_RET(read_wrappedGraph(reader, &ctx)); cannam@226: read_ws_star(reader); cannam@226: } else { cannam@226: return r_err(reader, SERD_ERR_BAD_SYNTAX, "graph in Turtle\n"); cannam@226: } cannam@226: break; cannam@226: default: cannam@226: subj = read_subject(reader, ctx, &ctx.subject, &s_type); cannam@226: if (!tokcmp(reader, ctx.subject, "base", 4)) { cannam@226: ret = read_base(reader, true, false); cannam@226: } else if (!tokcmp(reader, ctx.subject, "prefix", 6)) { cannam@226: ret = read_prefixID(reader, true, false); cannam@226: } else if (!tokcmp(reader, ctx.subject, "graph", 5)) { cannam@226: read_ws_star(reader); cannam@226: TRY_RET((ctx.graph = read_labelOrSubject(reader, ctx))); cannam@226: read_ws_star(reader); cannam@226: TRY_RET(read_wrappedGraph(reader, &ctx)); cannam@226: read_ws_star(reader); cannam@226: } else if (read_ws_star(reader) && peek_byte(reader) == '{') { cannam@226: if (s_type == 
'(' || (s_type == '[' && !*ctx.flags)) { cannam@226: return false; // invalid graph with complex label cannam@226: } cannam@226: ctx.graph = subj; cannam@226: ctx.subject = subj = 0; cannam@226: TRY_RET(read_wrappedGraph(reader, &ctx)); cannam@226: read_ws_star(reader); cannam@226: } else if (!subj) { cannam@226: ret = r_err(reader, SERD_ERR_BAD_SYNTAX, "bad subject\n"); cannam@226: } else if (!read_triples(reader, ctx, &ate_dot)) { cannam@226: ret = (s_type == '['); cannam@226: } else if (!ate_dot) { cannam@226: read_ws_star(reader); cannam@226: ret = (eat_byte_check(reader, '.') == '.'); cannam@226: } cannam@226: pop_node(reader, subj); cannam@226: break; cannam@226: } cannam@226: return ret; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_turtleDoc(SerdReader* reader) cannam@226: { cannam@226: while (!reader->eof) { cannam@226: TRY_RET(read_statement(reader)); cannam@226: } cannam@226: return reader->status <= SERD_FAILURE; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_trigDoc(SerdReader* reader) cannam@226: { cannam@226: while (!reader->eof) { cannam@226: TRY_RET(read_statement(reader)); cannam@226: } cannam@226: return reader->status <= SERD_FAILURE; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_nquadsDoc(SerdReader* reader) cannam@226: { cannam@226: while (!reader->eof) { cannam@226: SerdStatementFlags flags = 0; cannam@226: ReadContext ctx = { 0, 0, 0, 0, 0, 0, &flags }; cannam@226: bool ate_dot = false; cannam@226: char s_type = false; cannam@226: read_ws_star(reader); cannam@226: if (peek_byte(reader) == '\0') { cannam@226: reader->eof = true; cannam@226: break; cannam@226: } cannam@226: cannam@226: // subject predicate object cannam@226: if (!(ctx.subject = read_subject(reader, ctx, &ctx.subject, &s_type)) || cannam@226: !read_ws_star(reader) || cannam@226: !(ctx.predicate = read_IRIREF(reader)) || cannam@226: !read_ws_star(reader) || cannam@226: !read_object(reader, &ctx, false, &ate_dot)) { 
cannam@226: return false; cannam@226: } cannam@226: cannam@226: if (!ate_dot) { // graphLabel? cannam@226: TRY_RET(read_ws_star(reader)); cannam@226: switch (peek_byte(reader)) { cannam@226: case '.': cannam@226: break; cannam@226: case '_': cannam@226: ctx.graph = read_BLANK_NODE_LABEL(reader, &ate_dot); cannam@226: break; cannam@226: default: cannam@226: if (!(ctx.graph = read_IRIREF(reader))) { cannam@226: return false; cannam@226: } cannam@226: } cannam@226: cannam@226: // Terminating '.' cannam@226: TRY_RET(read_ws_star(reader)); cannam@226: eat_byte_check(reader, '.'); cannam@226: } cannam@226: cannam@226: TRY_RET(emit_statement(reader, ctx, ctx.object, ctx.datatype, ctx.lang)); cannam@226: pop_node(reader, ctx.graph); cannam@226: pop_node(reader, ctx.lang); cannam@226: pop_node(reader, ctx.datatype); cannam@226: pop_node(reader, ctx.object); cannam@226: } cannam@226: return reader->status <= SERD_FAILURE; cannam@226: } cannam@226: cannam@226: static bool cannam@226: read_doc(SerdReader* reader) cannam@226: { cannam@226: switch (reader->syntax) { cannam@226: case SERD_NQUADS: return read_nquadsDoc(reader); cannam@226: case SERD_TRIG: return read_trigDoc(reader); cannam@226: default: return read_turtleDoc(reader); cannam@226: } cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: SerdReader* cannam@226: serd_reader_new(SerdSyntax syntax, cannam@226: void* handle, cannam@226: void (*free_handle)(void*), cannam@226: SerdBaseSink base_sink, cannam@226: SerdPrefixSink prefix_sink, cannam@226: SerdStatementSink statement_sink, cannam@226: SerdEndSink end_sink) cannam@226: { cannam@226: const Cursor cur = { NULL, 0, 0 }; cannam@226: SerdReader* me = (SerdReader*)calloc(1, sizeof(SerdReader)); cannam@226: me->handle = handle; cannam@226: me->free_handle = free_handle; cannam@226: me->base_sink = base_sink; cannam@226: me->prefix_sink = prefix_sink; cannam@226: me->statement_sink = statement_sink; cannam@226: me->end_sink = end_sink; cannam@226: 
me->default_graph = SERD_NODE_NULL; cannam@226: me->stack = serd_stack_new(SERD_PAGE_SIZE); cannam@226: me->syntax = syntax; cannam@226: me->cur = cur; cannam@226: me->next_id = 1; cannam@226: cannam@226: me->rdf_first = push_node(me, SERD_URI, NS_RDF "first", 48); cannam@226: me->rdf_rest = push_node(me, SERD_URI, NS_RDF "rest", 47); cannam@226: me->rdf_nil = push_node(me, SERD_URI, NS_RDF "nil", 46); cannam@226: cannam@226: return me; cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: void cannam@226: serd_reader_set_strict(SerdReader* reader, bool strict) cannam@226: { cannam@226: reader->strict = strict; cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: void cannam@226: serd_reader_set_error_sink(SerdReader* reader, cannam@226: SerdErrorSink error_sink, cannam@226: void* error_handle) cannam@226: { cannam@226: reader->error_sink = error_sink; cannam@226: reader->error_handle = error_handle; cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: void cannam@226: serd_reader_free(SerdReader* reader) cannam@226: { cannam@226: pop_node(reader, reader->rdf_nil); cannam@226: pop_node(reader, reader->rdf_rest); cannam@226: pop_node(reader, reader->rdf_first); cannam@226: serd_node_free(&reader->default_graph); cannam@226: cannam@226: #ifdef SERD_STACK_CHECK cannam@226: free(reader->allocs); cannam@226: #endif cannam@226: free(reader->stack.buf); cannam@226: free(reader->bprefix); cannam@226: if (reader->free_handle) { cannam@226: reader->free_handle(reader->handle); cannam@226: } cannam@226: free(reader); cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: void* cannam@226: serd_reader_get_handle(const SerdReader* reader) cannam@226: { cannam@226: return reader->handle; cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: void cannam@226: serd_reader_add_blank_prefix(SerdReader* reader, cannam@226: const uint8_t* prefix) cannam@226: { cannam@226: free(reader->bprefix); cannam@226: reader->bprefix_len = 0; cannam@226: reader->bprefix = 
NULL; cannam@226: if (prefix) { cannam@226: reader->bprefix_len = strlen((const char*)prefix); cannam@226: reader->bprefix = (uint8_t*)malloc(reader->bprefix_len + 1); cannam@226: memcpy(reader->bprefix, prefix, reader->bprefix_len + 1); cannam@226: } cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: void cannam@226: serd_reader_set_default_graph(SerdReader* reader, cannam@226: const SerdNode* graph) cannam@226: { cannam@226: serd_node_free(&reader->default_graph); cannam@226: reader->default_graph = serd_node_copy(graph); cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: SerdStatus cannam@226: serd_reader_read_file(SerdReader* reader, cannam@226: const uint8_t* uri) cannam@226: { cannam@226: uint8_t* const path = serd_file_uri_parse(uri, NULL); cannam@226: if (!path) { cannam@226: return SERD_ERR_BAD_ARG; cannam@226: } cannam@226: cannam@226: FILE* fd = serd_fopen((const char*)path, "r"); cannam@226: if (!fd) { cannam@226: free(path); cannam@226: return SERD_ERR_UNKNOWN; cannam@226: } cannam@226: cannam@226: SerdStatus ret = serd_reader_read_file_handle(reader, fd, path); cannam@226: fclose(fd); cannam@226: free(path); cannam@226: return ret; cannam@226: } cannam@226: cannam@226: static bool cannam@226: skip_bom(SerdReader* me) cannam@226: { cannam@226: if (peek_byte(me) == 0xEF) { cannam@226: eat_byte_safe(me, 0xEF); cannam@226: if (eat_byte_check(me, 0xBB) != 0xBB || cannam@226: eat_byte_check(me, 0xBF) != 0xBF) { cannam@226: return r_err(me, SERD_ERR_BAD_SYNTAX, "corrupt byte order mark\n"); cannam@226: } cannam@226: } cannam@226: cannam@226: return true; cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: SerdStatus cannam@226: serd_reader_start_stream(SerdReader* me, cannam@226: FILE* file, cannam@226: const uint8_t* name, cannam@226: bool bulk) cannam@226: { cannam@226: return serd_reader_start_source_stream( cannam@226: me, cannam@226: bulk ? 
(SerdSource)fread : serd_file_read_byte, cannam@226: (SerdStreamErrorFunc)ferror, cannam@226: file, cannam@226: name, cannam@226: bulk ? SERD_PAGE_SIZE : 1); cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: SerdStatus cannam@226: serd_reader_start_source_stream(SerdReader* me, cannam@226: SerdSource read_func, cannam@226: SerdStreamErrorFunc error_func, cannam@226: void* stream, cannam@226: const uint8_t* name, cannam@226: size_t page_size) cannam@226: { cannam@226: const Cursor cur = { name, 1, 1 }; cannam@226: me->cur = cur; cannam@226: cannam@226: return serd_byte_source_open_source( cannam@226: &me->source, read_func, error_func, stream, page_size); cannam@226: } cannam@226: cannam@226: static SerdStatus cannam@226: serd_reader_prepare(SerdReader* me) cannam@226: { cannam@226: me->eof = false; cannam@226: if ((me->status = serd_byte_source_prepare(&me->source))) { cannam@226: r_err(me, me->status, "read error: %s\n", strerror(errno)); cannam@226: } else if (!skip_bom(me)) { cannam@226: me->status = SERD_ERR_BAD_SYNTAX; cannam@226: } cannam@226: return me->status; cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: SerdStatus cannam@226: serd_reader_read_chunk(SerdReader* me) cannam@226: { cannam@226: SerdStatus st = SERD_SUCCESS; cannam@226: if (!me->source.prepared) { cannam@226: if ((st = serd_reader_prepare(me))) { cannam@226: return st; cannam@226: } cannam@226: } else if (me->eof) { cannam@226: me->eof = false; cannam@226: if ((st = serd_byte_source_advance(&me->source))) { cannam@226: return st; cannam@226: } cannam@226: } cannam@226: return read_statement(me) ? 
SERD_SUCCESS : SERD_FAILURE; cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: SerdStatus cannam@226: serd_reader_end_stream(SerdReader* me) cannam@226: { cannam@226: return serd_byte_source_close(&me->source); cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: SerdStatus cannam@226: serd_reader_read_file_handle(SerdReader* me, FILE* file, const uint8_t* name) cannam@226: { cannam@226: return serd_reader_read_source( cannam@226: me, (SerdSource)fread, (SerdStreamErrorFunc)ferror, cannam@226: file, name, SERD_PAGE_SIZE); cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: SerdStatus cannam@226: serd_reader_read_source(SerdReader* me, cannam@226: SerdSource source, cannam@226: SerdStreamErrorFunc error, cannam@226: void* stream, cannam@226: const uint8_t* name, cannam@226: size_t page_size) cannam@226: { cannam@226: SerdStatus st = serd_reader_start_source_stream( cannam@226: me, source, error, stream, name, page_size); cannam@226: cannam@226: if ((st = serd_reader_prepare(me))) { cannam@226: serd_reader_end_stream(me); cannam@226: return st; cannam@226: } else if (!read_doc(me)) { cannam@226: serd_reader_end_stream(me); cannam@226: return SERD_ERR_UNKNOWN; cannam@226: } cannam@226: cannam@226: return serd_reader_end_stream(me); cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: SerdStatus cannam@226: serd_reader_read_string(SerdReader* me, const uint8_t* utf8) cannam@226: { cannam@226: const Cursor cur = { (const uint8_t*)"(string)", 1, 1 }; cannam@226: cannam@226: serd_byte_source_open_string(&me->source, utf8); cannam@226: me->cur = cur; cannam@226: me->eof = false; cannam@226: cannam@226: SerdStatus st = serd_reader_prepare(me); cannam@226: if (!st) { cannam@226: st = read_doc(me) ? SERD_SUCCESS : SERD_ERR_UNKNOWN; cannam@226: } cannam@226: cannam@226: serd_byte_source_close(&me->source); cannam@226: cannam@226: return st; cannam@226: }