diff options
author | Ali Fardan <raiz@stellarbound.space> | 2020-11-18 10:55:20 +0300 |
---|---|---|
committer | Ali Fardan <raiz@stellarbound.space> | 2020-11-18 10:55:20 +0300 |
commit | 6f3d10a3ec09cc278041ced4d4e506cb89a7c69a (patch) | |
tree | 5b5c7db3104d805d71029534cffcfd5d9c2aca0b /decode.c | |
download | libyuri-6f3d10a3ec09cc278041ced4d4e506cb89a7c69a.tar.gz |
initial commit; todo: thorough testing
Diffstat (limited to 'decode.c')
-rw-r--r-- | decode.c | 500 |
1 files changed, 500 insertions, 0 deletions
diff --git a/decode.c b/decode.c new file mode 100644 index 0000000..de546ba --- /dev/null +++ b/decode.c @@ -0,0 +1,500 @@ +#include <stdlib.h> +#include <string.h> + +#include "yuri.h" + +/* + * for an explanation of this hell please refer to + * RFC 3986 (Appendix A) + */ +#define _is_alpha(c)\ + ((c == 'A') ||\ + (c == 'B') ||\ + (c == 'C') ||\ + (c == 'D') ||\ + (c == 'E') ||\ + (c == 'F') ||\ + (c == 'G') ||\ + (c == 'H') ||\ + (c == 'I') ||\ + (c == 'J') ||\ + (c == 'K') ||\ + (c == 'L') ||\ + (c == 'M') ||\ + (c == 'N') ||\ + (c == 'O') ||\ + (c == 'P') ||\ + (c == 'Q') ||\ + (c == 'R') ||\ + (c == 'S') ||\ + (c == 'T') ||\ + (c == 'U') ||\ + (c == 'V') ||\ + (c == 'W') ||\ + (c == 'X') ||\ + (c == 'Y') ||\ + (c == 'Z') ||\ + (c == 'a') ||\ + (c == 'b') ||\ + (c == 'c') ||\ + (c == 'd') ||\ + (c == 'e') ||\ + (c == 'f') ||\ + (c == 'g') ||\ + (c == 'h') ||\ + (c == 'i') ||\ + (c == 'j') ||\ + (c == 'k') ||\ + (c == 'l') ||\ + (c == 'm') ||\ + (c == 'n') ||\ + (c == 'o') ||\ + (c == 'p') ||\ + (c == 'q') ||\ + (c == 'r') ||\ + (c == 's') ||\ + (c == 't') ||\ + (c == 'u') ||\ + (c == 'v') ||\ + (c == 'w') ||\ + (c == 'x') ||\ + (c == 'y') ||\ + (c == 'z')) + +#define _is_digit(c)\ + ((c == '0') ||\ + (c == '1') ||\ + (c == '2') ||\ + (c == '3') ||\ + (c == '4') ||\ + (c == '5') ||\ + (c == '6') ||\ + (c == '7') ||\ + (c == '8') ||\ + (c == '9')) + +#define _is_gen_delim(c)\ + ((c == ':') ||\ + (c == '/') ||\ + (c == '?') ||\ + (c == '#') ||\ + (c == '[') ||\ + (c == ']') ||\ + (c == '@')) + +#define _is_sub_delim(c)\ + ((c == '!') ||\ + (c == '$') ||\ + (c == '&') ||\ + (c == '\'') ||\ + (c == '(') ||\ + (c == ')') ||\ + (c == '*') ||\ + (c == '+') ||\ + (c == ',') ||\ + (c == ';') ||\ + (c == '=')) + +#define _is_unreserved(c)\ + (_is_alpha(c) ||\ + _is_digit(c) ||\ + (c == '-') ||\ + (c == '.') ||\ + (c == '_') ||\ + (c == '~')) + +#define _is_reserved(c)\ + (_is_gen_delim(c) ||\ + _is_sub_delim(c)) + +#define _is_pchar(c)\ + (_is_unreserved(c) ||\ + _is_sub_delim(c) ||\ + (c == ':') ||\ + (c == '@')) + +#define _is_segment(c)\ + _is_pchar(c) + +#define _is_segment_nc(c)\ + (_is_unreserved(c) ||\ + _is_sub_delim(c) ||\ + (c == '@')) + +/* + * TODO: I have written code for conversion to and + * from strings to abstract numbers representation + * that allows base conversion directly to any base + * efficiently, I could not find where I kept it, + * when I do this hack will be replaced with proper + * function. + * + * This is here because we want to make sure that + * when parsing numbers from strings to specify how + * long that number is represented in the string, it + * uses dynamic memory allocation for the temporary + * string which is inefficient. + */ +static int +_strtoi(const char *str, int n, int b) +{ + int ret; + char *strbuf; + + strbuf = strndup(str, n); + if (strbuf == NULL) + return -1; + + ret = strtol(strbuf, NULL, b); + free(strbuf); + + return ret; +} + +/* + * This function could be optimized too. + */ +static const char * +pct_decode(const char *text) +{ + int i, x; + int buflen; + char *buf; + char *reallocbuf; + + if (text == NULL) + return NULL; + + buflen = strlen(text)+1; + buf = strdup(text); + if (buf == NULL) + return NULL; + + x = 0; + i = 0; + while (i < buflen) { + if (text[i] == '%') { + i++; + buf[x] = _strtoi(text+i, 2, 16); + if (buf[x] == -1) { + free(buf); + return NULL; + } + i += 2; + x++; + continue; + } + buf[x] = text[i]; + i++; + x++; + } + + reallocbuf = realloc(buf, strlen(buf)+1); + if (reallocbuf == NULL) { + free(buf); + return NULL; + } + + return reallocbuf; +} + +static int +_uri_append_path(struct uri *uri, const char *item, int len) +{ + char **path; + int npath; + + if (uri->npath == 0) + npath = 1; + else + npath = uri->npath + 1; + + path = realloc(uri->path, sizeof(*uri->path)*npath); + if (path == NULL) + return -1; + uri->path = path; + uri->path[npath-1] = strndup(item, len); + if (uri->path[npath-1] == NULL) + return -1; + uri->npath = npath; + + return 0; +} + +struct uri * +uri_decode(const char *text) +{ + struct uri *ret; + const char *ptr; + const char *cpy; + const char *dup; + int dotctr; + int i; + + ret = uri_new(); + if (ret == NULL) + return NULL; + + ptr = text; + + /* look for scheme */ + if (_is_alpha(*ptr)) { + cpy = ptr; + ptr++; + while (*ptr != '\0' && (_is_alpha(*ptr) || _is_digit(*ptr) || *ptr == '+' || *ptr == '-' || *ptr == '.')) + ptr++; + if (*ptr == ':') { + ret->scheme = strndup(cpy, ptr-cpy); + if (ret->scheme == NULL) { + free(ret); + return NULL; + } + ptr++; + } else { + /* not found, rewind */ + ptr = cpy; + } + } + + /* there is authority */ + if (strncmp(ptr, "//", 2) == 0) { + ptr += 2; + + /* scan for userinfo */ + cpy = ptr; + while (*ptr != '\0' && (_is_unreserved(*ptr) || _is_sub_delim(*ptr) || *ptr == ':')) { + ptr++; + /* skip pct-encoded */ + if (*ptr == '%') + ptr += 2; + } + if (*ptr == '@') { + ret->authority.user = strndup(cpy, ptr-cpy); + if (ret->authority.user == NULL) { + uri_free(ret); + return NULL; + } + ptr++; + } else { + /* not found, reset back */ + ptr = cpy; + } + + /* try IP6 */ + if (*ptr == '[') { + ptr++; + cpy = ptr; + while (*ptr != '\0' && (_is_digit(*ptr) || _is_alpha(*ptr) || *ptr == ':')) + ptr++; + if (*ptr != ']') { + uri_free(ret); + return NULL; + } + ret->authority.host = strndup(cpy, ptr-cpy); + if (ret->authority.host == NULL) { + uri_free(ret); + return NULL; + } + ret->authority.type = YURI_HOST_IP6; + ptr++; + } + + /* not found? try IP4 */ + if (ret->authority.type == 0) { + dotctr = 0; + cpy = ptr; + while (*ptr != '\0' && (_is_digit(*ptr) || *ptr == '.')) { + if (*ptr == '.') + dotctr++; + ptr++; + } + if (dotctr == 3) { + if (*ptr != '\0' && *ptr != ':' && *ptr != '/' && *ptr != '?' && *ptr != '#') { + uri_free(ret); + return NULL; + } + ret->authority.host = strndup(cpy, ptr-cpy); + if (ret->authority.host == NULL) { + uri_free(ret); + return NULL; + } + ret->authority.type = YURI_HOST_IP4; + } else { + /* not and IP4 rewind and try again */ + ptr = cpy; + } + } + + /* not found? try IPFuture (not gonna happen) */ + if (ret->authority.type == 0) { + if (*ptr == 'v') { + if ((_is_digit(*(ptr+1)) || _is_alpha(*(ptr+1))) && + (_is_digit(*(ptr+2)) || _is_alpha(*(ptr+2))) && + *(ptr+3) == '.') { + ptr += 4; + cpy = ptr; + while (*ptr != '\0' && (_is_unreserved(*ptr) || _is_sub_delim(*ptr) || *ptr == ':')) + ptr++; + if (*ptr != '\0' && *ptr != ':' && *ptr != '/' && *ptr != '?' && *ptr != '#') { + uri_free(ret); + return NULL; + } + ret->authority.host = strndup(cpy, ptr-cpy); + if (ret->authority.host == NULL) { + uri_free(ret); + return NULL; + } + ret->authority.type = YURI_HOST_IPFUTURE; + } + } + } + + /* not found? try name */ + if (ret->authority.type == 0) { + while (*ptr != '\0' && (_is_unreserved(*ptr) || _is_sub_delim(*ptr))) { + ptr++; + /* skip pct-encoded */ + if (*ptr == '%') + ptr += 2; + } + if (*ptr != '\0' && *ptr != ':' && *ptr != '/' && *ptr != '?' && *ptr != '#') { + uri_free(ret); + return NULL; + } + ret->authority.host = strndup(cpy, ptr-cpy); + if (ret->authority.host == NULL) { + uri_free(ret); + return NULL; + } + ret->authority.type = YURI_HOST_NAME; + } + + /* host is set, check if there's alternative port */ + if (ret->authority.host != 0 && *ptr == ':') { + ptr++; + cpy = ptr; + while (*ptr != '\0' && _is_digit(*ptr)) + ptr++; + ret->authority.port = _strtoi(cpy, ptr-cpy, 10); + if (ret->authority.port == -1) { + uri_free(ret); + return NULL; + } + } + } + + /* look for path */ + if ((ret->authority.host && *ptr == '/') || _is_segment_nc(*ptr)) { + do { + if (*ptr == '/') + ptr++; + cpy = ptr; + while (*ptr != '\0' && (ret->scheme ? _is_segment(*ptr) : _is_segment_nc(*ptr))) { + ptr++; + /* skip pct-encoded */ + if (*ptr == '%') + ptr += 2; + } + if (_uri_append_path(ret, cpy, ptr-cpy) == -1) { + uri_free(ret); + return NULL; + } + } while (*ptr != '\0' && *ptr == '/'); + } + + /* look for query */ + if (*ptr == '?') { + ptr++; + cpy = ptr; + while (*ptr != '\0' && (_is_pchar(*ptr) || *ptr == '/' || *ptr == '?')) { + ptr++; + /* skip pct-encoded */ + if (*ptr == '%') + ptr += 2; + } + ret->query = strndup(cpy, ptr-cpy); + if (ret->query == NULL) { + uri_free(ret); + return NULL; + } + } + + /* look for fragment */ + if (*ptr == '#') { + ptr++; + cpy = ptr; + while (*ptr != '\0' && (_is_pchar(*ptr) || *ptr == '/' || *ptr == '?')) { + ptr++; + /* skip pct-encoded */ + if (*ptr == '%') + ptr += 2; + } + ret->fragment = strndup(cpy, ptr-cpy); + if (ret->fragment == NULL) { + uri_free(ret); + return NULL; + } + } + + /* if there is still some trailing text, this is a bug, fail */ + if (*ptr != '\0') { + uri_free(ret); + return NULL; + } + + /* decode percent encoded characters */ + if (ret->authority.user) { + dup = pct_decode(ret->authority.user); + if (dup == NULL) { + uri_free(ret); + return NULL; + } + free(ret->authority.user); + ret->authority.user = dup; + } + if (ret->authority.host) { + dup = pct_decode(ret->authority.host); + if (dup == NULL) { + uri_free(ret); + return NULL; + } + free(ret->authority.host); + ret->authority.host = dup; + } + if (ret->npath != 0) { + for (i = 0; i < ret->npath; i++) { + dup = pct_decode(ret->path[i]); + if (dup == NULL) { + uri_free(ret); + return NULL; + } + free(ret->path[i]); + ret->path[i] = dup; + } + } + if (ret->query) { + dup = pct_decode(ret->query); + if (dup == NULL) { + uri_free(ret); + return NULL; + } + free(ret->query); + ret->query = dup; + } + if (ret->fragment) { + dup = pct_decode(ret->fragment); + if (dup == NULL) { + uri_free(ret); + return NULL; + } + free(ret->fragment); + ret->fragment = dup; + } + + if (uri_normalize(ret) == -1) { + uri_free(ret); + return NULL; + } + + return ret; +} |