cannam@226: /* cannam@226: Copyright 2011-2014 David Robillard cannam@226: cannam@226: Permission to use, copy, modify, and/or distribute this software for any cannam@226: purpose with or without fee is hereby granted, provided that the above cannam@226: copyright notice and this permission notice appear in all copies. cannam@226: cannam@226: THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES cannam@226: WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF cannam@226: MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR cannam@226: ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES cannam@226: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN cannam@226: ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF cannam@226: OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. cannam@226: */ cannam@226: cannam@226: #include "serd_internal.h" cannam@226: cannam@226: #include cannam@226: #include cannam@226: cannam@226: // #define URI_DEBUG 1 cannam@226: cannam@226: SERD_API cannam@226: const uint8_t* cannam@226: serd_uri_to_path(const uint8_t* uri) cannam@226: { cannam@226: const uint8_t* path = uri; cannam@226: if (!is_windows_path(uri) && serd_uri_string_has_scheme(uri)) { cannam@226: if (strncmp((const char*)uri, "file:", 5)) { cannam@226: fprintf(stderr, "Non-file URI `%s'\n", uri); cannam@226: return NULL; cannam@226: } else if (!strncmp((const char*)uri, "file://localhost/", 17)) { cannam@226: path = uri + 16; cannam@226: } else if (!strncmp((const char*)uri, "file://", 7)) { cannam@226: path = uri + 7; cannam@226: } else { cannam@226: fprintf(stderr, "Invalid file URI `%s'\n", uri); cannam@226: return NULL; cannam@226: } cannam@226: if (is_windows_path(path + 1)) { cannam@226: ++path; // Special case for terrible Windows file URIs cannam@226: } cannam@226: } cannam@226: return path; cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: uint8_t* cannam@226: serd_file_uri_parse(const uint8_t* uri, uint8_t** hostname) cannam@226: { cannam@226: const uint8_t* path = uri; cannam@226: if (hostname) { cannam@226: *hostname = NULL; cannam@226: } cannam@226: if (!strncmp((const char*)uri, "file://", 7)) { cannam@226: const uint8_t* auth = uri + 7; cannam@226: if (*auth == '/') { // No hostname cannam@226: path = auth; cannam@226: } else { // Has hostname cannam@226: if (!(path = (const uint8_t*)strchr((const char*)auth, '/'))) { cannam@226: return NULL; cannam@226: } cannam@226: if (hostname) { cannam@226: *hostname = (uint8_t*)calloc(1, path - auth + 1); cannam@226: memcpy(*hostname, auth, path - auth); cannam@226: } cannam@226: } cannam@226: } cannam@226: cannam@226: if (is_windows_path(path + 1)) { cannam@226: ++path; cannam@226: } cannam@226: cannam@226: SerdChunk chunk = { NULL, 0 }; cannam@226: for (const uint8_t* s = path; *s; ++s) { cannam@226: if (*s == '%') { cannam@226: if (*(s + 1) == '%') { cannam@226: serd_chunk_sink("%", 1, &chunk); cannam@226: ++s; cannam@226: } else if (is_digit(*(s + 1)) && is_digit(*(s + 2))) { cannam@226: const uint8_t code[3] = { *(s + 1), *(s + 2), 0 }; cannam@226: uint32_t num; cannam@226: sscanf((const char*)code, "%X", &num); cannam@226: const uint8_t c = num; cannam@226: serd_chunk_sink(&c, 1, &chunk); cannam@226: s += 2; cannam@226: } else { cannam@226: s += 2; // Junk escape, ignore cannam@226: } cannam@226: } else { cannam@226: serd_chunk_sink(s, 1, &chunk); cannam@226: } cannam@226: } cannam@226: return serd_chunk_sink_finish(&chunk); cannam@226: } cannam@226: cannam@226: SERD_API cannam@226: bool cannam@226: serd_uri_string_has_scheme(const uint8_t* utf8) cannam@226: { cannam@226: // RFC3986: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) cannam@226: if (!utf8 || !is_alpha(utf8[0])) { cannam@226: return false; // Invalid scheme initial character, URI is relative cannam@226: } cannam@226: for (uint8_t c; (c = *++utf8) != '\0';) { cannam@226: switch (c) { cannam@226: case ':': cannam@226: return true; // End of scheme cannam@226: case '+': case '-': case '.': cannam@226: break; // Valid scheme character, continue cannam@226: default: cannam@226: if (!is_alpha(c) && !is_digit(c)) { cannam@226: return false; // Invalid scheme character cannam@226: } cannam@226: } cannam@226: } cannam@226: cannam@226: return false; cannam@226: } cannam@226: cannam@226: #ifdef URI_DEBUG cannam@226: static void cannam@226: serd_uri_dump(const SerdURI* uri, FILE* file) cannam@226: { cannam@226: #define PRINT_PART(range, name) \ cannam@226: if (range.buf) { \ cannam@226: fprintf(stderr, " " name " = "); \ cannam@226: fwrite((range).buf, 1, (range).len, stderr); \ cannam@226: fprintf(stderr, "\n"); \ cannam@226: } cannam@226: cannam@226: PRINT_PART(uri->scheme, "scheme "); cannam@226: PRINT_PART(uri->authority, "authority"); cannam@226: PRINT_PART(uri->path_base, "path_base"); cannam@226: PRINT_PART(uri->path, "path "); cannam@226: PRINT_PART(uri->query, "query "); cannam@226: PRINT_PART(uri->fragment, "fragment "); cannam@226: } cannam@226: #endif cannam@226: cannam@226: SERD_API cannam@226: SerdStatus cannam@226: serd_uri_parse(const uint8_t* utf8, SerdURI* uri) cannam@226: { cannam@226: *uri = SERD_URI_NULL; cannam@226: cannam@226: const uint8_t* ptr = utf8; cannam@226: cannam@226: /* See http://tools.ietf.org/html/rfc3986#section-3 cannam@226: URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] cannam@226: */ cannam@226: cannam@226: /* S3.1: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */ cannam@226: if (is_alpha(*ptr)) { cannam@226: for (uint8_t c = *++ptr; true; c = *++ptr) { cannam@226: switch (c) { cannam@226: case '\0': case '/': case '?': case '#': cannam@226: ptr = utf8; cannam@226: goto path; // Relative URI (starts with path by definition) cannam@226: case ':': cannam@226: uri->scheme.buf = utf8; cannam@226: uri->scheme.len = (ptr++) - utf8; cannam@226: goto maybe_authority; // URI with scheme cannam@226: case '+': case '-': case '.': cannam@226: continue; cannam@226: default: cannam@226: if (is_alpha(c) || is_digit(c)) { cannam@226: continue; cannam@226: } cannam@226: } cannam@226: } cannam@226: } cannam@226: cannam@226: /* S3.2: The authority component is preceded by a double slash ("//") cannam@226: and is terminated by the next slash ("/"), question mark ("?"), cannam@226: or number sign ("#") character, or by the end of the URI. cannam@226: */ cannam@226: maybe_authority: cannam@226: if (*ptr == '/' && *(ptr + 1) == '/') { cannam@226: ptr += 2; cannam@226: uri->authority.buf = ptr; cannam@226: for (uint8_t c; (c = *ptr) != '\0'; ++ptr) { cannam@226: switch (c) { cannam@226: case '/': goto path; cannam@226: case '?': goto query; cannam@226: case '#': goto fragment; cannam@226: default: cannam@226: ++uri->authority.len; cannam@226: } cannam@226: } cannam@226: } cannam@226: cannam@226: /* RFC3986 S3.3: The path is terminated by the first question mark ("?") cannam@226: or number sign ("#") character, or by the end of the URI. cannam@226: */ cannam@226: path: cannam@226: switch (*ptr) { cannam@226: case '?': goto query; cannam@226: case '#': goto fragment; cannam@226: case '\0': goto end; cannam@226: default: break; cannam@226: } cannam@226: uri->path.buf = ptr; cannam@226: uri->path.len = 0; cannam@226: for (uint8_t c; (c = *ptr) != '\0'; ++ptr) { cannam@226: switch (c) { cannam@226: case '?': goto query; cannam@226: case '#': goto fragment; cannam@226: default: cannam@226: ++uri->path.len; cannam@226: } cannam@226: } cannam@226: cannam@226: /* RFC3986 S3.4: The query component is indicated by the first question cannam@226: mark ("?") character and terminated by a number sign ("#") character cannam@226: or by the end of the URI. cannam@226: */ cannam@226: query: cannam@226: if (*ptr == '?') { cannam@226: uri->query.buf = ++ptr; cannam@226: for (uint8_t c; (c = *ptr) != '\0'; ++ptr) { cannam@226: switch (c) { cannam@226: case '#': cannam@226: goto fragment; cannam@226: default: cannam@226: ++uri->query.len; cannam@226: } cannam@226: } cannam@226: } cannam@226: cannam@226: /* RFC3986 S3.5: A fragment identifier component is indicated by the cannam@226: presence of a number sign ("#") character and terminated by the end cannam@226: of the URI. cannam@226: */ cannam@226: fragment: cannam@226: if (*ptr == '#') { cannam@226: uri->fragment.buf = ptr; cannam@226: while (*ptr++ != '\0') { cannam@226: ++uri->fragment.len; cannam@226: } cannam@226: } cannam@226: cannam@226: end: cannam@226: #ifdef URI_DEBUG cannam@226: fprintf(stderr, "PARSE URI <%s>\n", utf8); cannam@226: serd_uri_dump(uri, stderr); cannam@226: fprintf(stderr, "\n"); cannam@226: #endif cannam@226: cannam@226: return SERD_SUCCESS; cannam@226: } cannam@226: cannam@226: /** cannam@226: Remove leading dot components from `path`. cannam@226: See http://tools.ietf.org/html/rfc3986#section-5.2.3 cannam@226: @param up Set to the number of up-references (e.g. "../") trimmed cannam@226: @return A pointer to the new start of `path` cannam@226: */ cannam@226: static const uint8_t* cannam@226: remove_dot_segments(const uint8_t* path, size_t len, size_t* up) cannam@226: { cannam@226: const uint8_t* begin = path; cannam@226: const uint8_t* const end = path + len; cannam@226: cannam@226: *up = 0; cannam@226: while (begin < end) { cannam@226: switch (begin[0]) { cannam@226: case '.': cannam@226: switch (begin[1]) { cannam@226: case '/': cannam@226: begin += 2; // Chop leading "./" cannam@226: break; cannam@226: case '.': cannam@226: switch (begin[2]) { cannam@226: case '\0': cannam@226: ++*up; cannam@226: begin += 2; // Chop input ".." cannam@226: break; cannam@226: case '/': cannam@226: ++*up; cannam@226: begin += 3; // Chop leading "../" cannam@226: break; cannam@226: default: cannam@226: return begin; cannam@226: } cannam@226: break; cannam@226: case '\0': cannam@226: ++begin; // Chop input "." (and fall-through) cannam@226: default: cannam@226: return begin; cannam@226: } cannam@226: break; cannam@226: case '/': cannam@226: switch (begin[1]) { cannam@226: case '.': cannam@226: switch (begin[2]) { cannam@226: case '/': cannam@226: begin += 2; // Leading "/./" => "/" cannam@226: break; cannam@226: case '.': cannam@226: switch (begin[3]) { cannam@226: case '/': cannam@226: ++*up; cannam@226: begin += 3; // Leading "/../" => "/" cannam@226: } cannam@226: break; cannam@226: default: cannam@226: return begin; cannam@226: } cannam@226: } // else fall through cannam@226: default: cannam@226: return begin; // Finished chopping dot components cannam@226: } cannam@226: } cannam@226: cannam@226: return begin; cannam@226: } cannam@226: cannam@226: /// Merge `base` and `path` in-place cannam@226: static void cannam@226: merge(SerdChunk* base, SerdChunk* path) cannam@226: { cannam@226: size_t up; cannam@226: const uint8_t* begin = remove_dot_segments(path->buf, path->len, &up); cannam@226: const uint8_t* end = path->buf + path->len; cannam@226: cannam@226: if (base->len) { cannam@226: // Find the up'th last slash cannam@226: const uint8_t* base_last = (base->buf + base->len - 1); cannam@226: ++up; cannam@226: do { cannam@226: if (*base_last == '/') { cannam@226: --up; cannam@226: } cannam@226: } while (up > 0 && (--base_last > base->buf)); cannam@226: cannam@226: // Set path prefix cannam@226: base->len = base_last - base->buf + 1; cannam@226: } cannam@226: cannam@226: // Set path suffix cannam@226: path->buf = begin; cannam@226: path->len = end - begin; cannam@226: } cannam@226: cannam@226: /// See http://tools.ietf.org/html/rfc3986#section-5.2.2 cannam@226: SERD_API cannam@226: void cannam@226: serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t) cannam@226: { cannam@226: if (!base->scheme.len) { cannam@226: *t = *r; // Don't resolve against non-absolute URIs cannam@226: return; cannam@226: } cannam@226: cannam@226: t->path_base.buf = NULL; cannam@226: t->path_base.len = 0; cannam@226: if (r->scheme.len) { cannam@226: *t = *r; cannam@226: } else { cannam@226: if (r->authority.len) { cannam@226: t->authority = r->authority; cannam@226: t->path = r->path; cannam@226: t->query = r->query; cannam@226: } else { cannam@226: t->path = r->path; cannam@226: if (!r->path.len) { cannam@226: t->path_base = base->path; cannam@226: if (r->query.len) { cannam@226: t->query = r->query; cannam@226: } else { cannam@226: t->query = base->query; cannam@226: } cannam@226: } else { cannam@226: if (r->path.buf[0] != '/') { cannam@226: t->path_base = base->path; cannam@226: } cannam@226: merge(&t->path_base, &t->path); cannam@226: t->query = r->query; cannam@226: } cannam@226: t->authority = base->authority; cannam@226: } cannam@226: t->scheme = base->scheme; cannam@226: t->fragment = r->fragment; cannam@226: } cannam@226: cannam@226: #ifdef URI_DEBUG cannam@226: fprintf(stderr, "## RESOLVE URI\n# BASE\n"); cannam@226: serd_uri_dump(base, stderr); cannam@226: fprintf(stderr, "# URI\n"); cannam@226: serd_uri_dump(r, stderr); cannam@226: fprintf(stderr, "# RESULT\n"); cannam@226: serd_uri_dump(t, stderr); cannam@226: fprintf(stderr, "\n"); cannam@226: #endif cannam@226: } cannam@226: cannam@226: /** Write the path of `uri` starting at index `i` */ cannam@226: static size_t cannam@226: write_path_tail(SerdSink sink, void* stream, const SerdURI* uri, size_t i) cannam@226: { cannam@226: size_t len = 0; cannam@226: if (i < uri->path_base.len) { cannam@226: len += sink(uri->path_base.buf + i, uri->path_base.len - i, stream); cannam@226: } cannam@226: if (uri->path.buf) { cannam@226: if (i < uri->path_base.len) { cannam@226: len += sink(uri->path.buf, uri->path.len, stream); cannam@226: } else { cannam@226: const size_t j = (i - uri->path_base.len); cannam@226: len += sink(uri->path.buf + j, uri->path.len - j, stream); cannam@226: } cannam@226: } cannam@226: return len; cannam@226: } cannam@226: cannam@226: /** Write the path of `uri` relative to the path of `base`. */ cannam@226: static size_t cannam@226: write_rel_path(SerdSink sink, cannam@226: void* stream, cannam@226: const SerdURI* uri, cannam@226: const SerdURI* base) cannam@226: { cannam@226: const size_t path_len = uri_path_len(uri); cannam@226: const size_t base_len = uri_path_len(base); cannam@226: const size_t min_len = (path_len < base_len) ? path_len : base_len; cannam@226: cannam@226: // Find the last separator common to both paths cannam@226: size_t last_shared_sep = 0; cannam@226: size_t i = 0; cannam@226: for (; i < min_len && uri_path_at(uri, i) == uri_path_at(base, i); ++i) { cannam@226: if (uri_path_at(uri, i) == '/') { cannam@226: last_shared_sep = i; cannam@226: } cannam@226: } cannam@226: cannam@226: if (i == path_len && i == base_len) { // Paths are identical cannam@226: return 0; cannam@226: } else if (last_shared_sep == 0) { // No common components cannam@226: return write_path_tail(sink, stream, uri, 0); cannam@226: } cannam@226: cannam@226: // Find the number of up references ("..") required cannam@226: size_t up = 0; cannam@226: for (size_t s = last_shared_sep + 1; s < base_len; ++s) { cannam@226: if (uri_path_at(base, s) == '/') { cannam@226: ++up; cannam@226: } cannam@226: } cannam@226: cannam@226: // Write up references cannam@226: size_t len = 0; cannam@226: for (size_t u = 0; u < up; ++u) { cannam@226: len += sink("../", 3, stream); cannam@226: } cannam@226: cannam@226: // Write suffix cannam@226: return len += write_path_tail(sink, stream, uri, last_shared_sep + 1); cannam@226: } cannam@226: cannam@226: /// See http://tools.ietf.org/html/rfc3986#section-5.3 cannam@226: SERD_API cannam@226: size_t cannam@226: serd_uri_serialise_relative(const SerdURI* uri, cannam@226: const SerdURI* base, cannam@226: const SerdURI* root, cannam@226: SerdSink sink, cannam@226: void* stream) cannam@226: { cannam@226: size_t len = 0; cannam@226: const bool relative = uri_is_under(uri, root ? root : base); cannam@226: if (relative) { cannam@226: len = write_rel_path(sink, stream, uri, base); cannam@226: } cannam@226: if (!relative || (!len && base->query.buf)) { cannam@226: if (uri->scheme.buf) { cannam@226: len += sink(uri->scheme.buf, uri->scheme.len, stream); cannam@226: len += sink(":", 1, stream); cannam@226: } cannam@226: if (uri->authority.buf) { cannam@226: len += sink("//", 2, stream); cannam@226: len += sink(uri->authority.buf, uri->authority.len, stream); cannam@226: } cannam@226: len += write_path_tail(sink, stream, uri, 0); cannam@226: } cannam@226: if (uri->query.buf) { cannam@226: len += sink("?", 1, stream); cannam@226: len += sink(uri->query.buf, uri->query.len, stream); cannam@226: } cannam@226: if (uri->fragment.buf) { cannam@226: // Note uri->fragment.buf includes the leading `#' cannam@226: len += sink(uri->fragment.buf, uri->fragment.len, stream); cannam@226: } cannam@226: return len; cannam@226: } cannam@226: cannam@226: /// See http://tools.ietf.org/html/rfc3986#section-5.3 cannam@226: SERD_API cannam@226: size_t cannam@226: serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream) cannam@226: { cannam@226: return serd_uri_serialise_relative(uri, NULL, NULL, sink, stream); cannam@226: }