summary refs log tree commit diff stats
path: root/decode.c
diff options
context:
space:
mode:
author Ali Fardan <raiz@stellarbound.space> 2020-11-18 10:55:20 +0300
committer Ali Fardan <raiz@stellarbound.space> 2020-11-18 10:55:20 +0300
commit 6f3d10a3ec09cc278041ced4d4e506cb89a7c69a (patch)
tree 5b5c7db3104d805d71029534cffcfd5d9c2aca0b /decode.c
download libyuri-6f3d10a3ec09cc278041ced4d4e506cb89a7c69a.tar.gz
initial commit; todo: thorough testing
Diffstat (limited to 'decode.c')
-rw-r--r-- decode.c 500
1 files changed, 500 insertions, 0 deletions
diff --git a/decode.c b/decode.c
new file mode 100644
index 0000000..de546ba
--- /dev/null
+++ b/decode.c
@@ -0,0 +1,500 @@
+#include <errno.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "yuri.h"
+
+/*
+ * for an explanation of this hell please refer to
+ * RFC 3986 (Appendix A)
+ */
+/*
+ * ALPHA: evaluates to 1 when c is one of the 52 letters A-Z / a-z and
+ * to 0 otherwise.  Every letter is compared individually, so nothing is
+ * assumed about how the letters are laid out in the character set.
+ *
+ * The argument is parenthesized at every use so that any expression may
+ * be passed, but it is evaluated repeatedly: it must be free of side
+ * effects.
+ */
+#define _is_alpha(c)\
+	(((c) == 'A') || ((c) == 'B') || ((c) == 'C') || ((c) == 'D') ||\
+	 ((c) == 'E') || ((c) == 'F') || ((c) == 'G') || ((c) == 'H') ||\
+	 ((c) == 'I') || ((c) == 'J') || ((c) == 'K') || ((c) == 'L') ||\
+	 ((c) == 'M') || ((c) == 'N') || ((c) == 'O') || ((c) == 'P') ||\
+	 ((c) == 'Q') || ((c) == 'R') || ((c) == 'S') || ((c) == 'T') ||\
+	 ((c) == 'U') || ((c) == 'V') || ((c) == 'W') || ((c) == 'X') ||\
+	 ((c) == 'Y') || ((c) == 'Z') ||\
+	 ((c) == 'a') || ((c) == 'b') || ((c) == 'c') || ((c) == 'd') ||\
+	 ((c) == 'e') || ((c) == 'f') || ((c) == 'g') || ((c) == 'h') ||\
+	 ((c) == 'i') || ((c) == 'j') || ((c) == 'k') || ((c) == 'l') ||\
+	 ((c) == 'm') || ((c) == 'n') || ((c) == 'o') || ((c) == 'p') ||\
+	 ((c) == 'q') || ((c) == 'r') || ((c) == 's') || ((c) == 't') ||\
+	 ((c) == 'u') || ((c) == 'v') || ((c) == 'w') || ((c) == 'x') ||\
+	 ((c) == 'y') || ((c) == 'z'))
+
+/*
+ * DIGIT: evaluates to 1 when c is one of '0'..'9' and to 0 otherwise.
+ * The argument is parenthesized at every use but evaluated repeatedly;
+ * it must be free of side effects.
+ */
+#define _is_digit(c)\
+	(((c) == '0') || ((c) == '1') || ((c) == '2') || ((c) == '3') ||\
+	 ((c) == '4') || ((c) == '5') || ((c) == '6') || ((c) == '7') ||\
+	 ((c) == '8') || ((c) == '9'))
+
+/*
+ * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+ * Evaluates to 1 when c is one of those characters, 0 otherwise.
+ * The argument is evaluated repeatedly; it must be free of side effects.
+ */
+#define _is_gen_delim(c)\
+	(((c) == ':') || ((c) == '/') || ((c) == '?') || ((c) == '#') ||\
+	 ((c) == '[') || ((c) == ']') || ((c) == '@'))
+
+/*
+ * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
+ * Evaluates to 1 when c is one of those characters, 0 otherwise.
+ * The argument is evaluated repeatedly; it must be free of side effects.
+ */
+#define _is_sub_delim(c)\
+	(((c) == '!')  || ((c) == '$') || ((c) == '&') || ((c) == '\'') ||\
+	 ((c) == '(')  || ((c) == ')') || ((c) == '*') || ((c) == '+')  ||\
+	 ((c) == ',')  || ((c) == ';') || ((c) == '='))
+
+/*
+ * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+ * The argument is parenthesized before it is compared or handed on to
+ * the ALPHA / DIGIT tests; it is evaluated repeatedly and must be free
+ * of side effects.
+ */
+#define _is_unreserved(c)\
+	(_is_alpha((c)) ||\
+	 _is_digit((c)) ||\
+	 ((c) == '-')   ||\
+	 ((c) == '.')   ||\
+	 ((c) == '_')   ||\
+	 ((c) == '~'))
+
+/* reserved = gen-delims / sub-delims */
+#define _is_reserved(c) (_is_gen_delim(c) || _is_sub_delim(c))
+
+/*
+ * Single-character part of pchar: unreserved / sub-delims / ":" / "@".
+ * The pct-encoded alternative of pchar spans three characters and is
+ * therefore not covered by this macro.
+ * The argument is evaluated repeatedly; it must be free of side effects.
+ */
+#define _is_pchar(c)\
+	(_is_unreserved((c)) ||\
+	 _is_sub_delim((c))  ||\
+	 ((c) == ':')        ||\
+	 ((c) == '@'))
+
+/* A path segment character is simply a pchar. */
+#define _is_segment(c) _is_pchar(c)
+
+/*
+ * Character of a "no colon" segment: unreserved / sub-delims / "@",
+ * i.e. the pchar test without ":".
+ * The argument is evaluated repeatedly; it must be free of side effects.
+ */
+#define _is_segment_nc(c)\
+	(_is_unreserved((c)) ||\
+	 _is_sub_delim((c))  ||\
+	 ((c) == '@'))
+
+/*
+ * Parse at most the first n characters of str as an integer in base b.
+ *
+ * strtol() cannot be told where to stop reading, so the slice is first
+ * copied into a small NUL-terminated buffer on the stack; no heap
+ * allocation takes place.  The parsing itself follows strtol() rules
+ * (optional leading whitespace and sign, stops at the first character
+ * that is not a digit of base b), so an empty or non-numeric slice
+ * yields 0.
+ *
+ * Returns the parsed value, or -1 when str is NULL, n is negative, the
+ * slice does not fit in the buffer, or the value does not fit in an
+ * int.  -1 is therefore indistinguishable from a slice that literally
+ * spells "-1"; the callers in this file treat -1 as an error.
+ *
+ * TODO: replace with a dedicated length-bounded parser that does not
+ * need the temporary copy at all.
+ */
+static int
+_strtoi(const char *str, int n, int b)
+{
+	char numbuf[64];
+	const char *nul;
+	size_t len;
+	long val;
+
+	if (str == NULL || n < 0)
+		return -1;
+
+	/* never copy past a terminator that comes before n characters */
+	len = (size_t)n;
+	nul = memchr(str, '\0', len);
+	if (nul != NULL)
+		len = (size_t)(nul - str);
+	if (len >= sizeof(numbuf))
+		return -1;
+
+	memcpy(numbuf, str, len);
+	numbuf[len] = '\0';
+
+	/* errno is the only way to detect overflow where long == int */
+	errno = 0;
+	val = strtol(numbuf, NULL, b);
+	if (errno == ERANGE || val < INT_MIN || val > INT_MAX)
+		return -1;
+
+	return (int)val;
+}
+
+/*
+ * Value of the hexadecimal digit c (either case), or -1 when c is not
+ * a hexadecimal digit.
+ */
+static int
+_hex_value(char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+	return -1;
+}
+
+/*
+ * Return a newly allocated copy of text in which every "%XX" escape is
+ * replaced by the byte it encodes; all other characters are copied
+ * unchanged.
+ *
+ * Returns NULL when text is NULL, when memory cannot be allocated, or
+ * when a '%' is not followed by two hexadecimal digits (this includes
+ * an escape cut short by the end of the string).  The result is
+ * obtained from malloc()/realloc() and must be released with free().
+ *
+ * Every byte value 0x00-0xFF can be produced.  A decoded "%00" becomes
+ * an embedded NUL, so code that treats the result as a C string sees it
+ * end at that point.
+ */
+static const char *
+pct_decode(const char *text)
+{
+	size_t len, in, out;
+	char *buf;
+	char *shrunk;
+	int hi, lo;
+
+	if (text == NULL)
+		return NULL;
+
+	/* decoding never grows the string, so the input length suffices */
+	len = strlen(text);
+	buf = malloc(len + 1);
+	if (buf == NULL)
+		return NULL;
+
+	out = 0;
+	for (in = 0; in < len; in++) {
+		if (text[in] != '%') {
+			buf[out++] = text[in];
+			continue;
+		}
+
+		/*
+		 * text[in+1] is at worst the terminator, which is not a
+		 * hex digit; text[in+2] is only read once text[in+1] is
+		 * known to be a real character.
+		 */
+		hi = _hex_value(text[in+1]);
+		lo = (hi == -1) ? -1 : _hex_value(text[in+2]);
+		if (lo == -1) {
+			free(buf);
+			return NULL;
+		}
+		buf[out++] = (char)(hi * 16 + lo);
+		in += 2;
+	}
+	buf[out] = '\0';
+
+	/* give back the bytes saved by decoding; keep buf if that fails */
+	shrunk = realloc(buf, out + 1);
+	if (shrunk != NULL)
+		buf = shrunk;
+
+	return buf;
+}
+
+/*
+ * Append a copy of the first len characters of item as a new last
+ * element of uri->path.
+ *
+ * The pointer array is grown by one slot with realloc() and the new
+ * slot receives a strndup() copy of item.  uri->npath is only updated
+ * after both allocations have succeeded.
+ *
+ * Returns 0 on success and -1 when either allocation fails.
+ */
+static int
+_uri_append_path(struct uri *uri, const char *item, int len)
+{
+	int slot = uri->npath;
+	char **grown;
+
+	grown = realloc(uri->path, sizeof(*uri->path) * (slot + 1));
+	if (grown == NULL)
+		return -1;
+	uri->path = grown;
+
+	uri->path[slot] = strndup(item, len);
+	if (uri->path[slot] == NULL)
+		return -1;
+
+	uri->npath = slot + 1;
+	return 0;
+}
+
+/* Nonzero when c is a hexadecimal digit (either case). */
+static int
+_is_hexdig(char c)
+{
+	return _is_digit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+}
+
+/*
+ * Nonzero when p points at a complete pct-encoded triplet, that is '%'
+ * followed by two hexadecimal digits.  The tests short-circuit, so
+ * nothing beyond the string terminator is ever read.
+ */
+static int
+_is_pct_triplet(const char *p)
+{
+	return p[0] == '%' && _is_hexdig(p[1]) && _is_hexdig(p[2]);
+}
+
+/*
+ * Parse text as a URI reference into an object obtained from uri_new().
+ *
+ * The components are recognized in this order: scheme, authority
+ * (introduced by "//": userinfo, then a host that is tried as bracketed
+ * IPv6, IPv4, IPvFuture and finally registered name, then port), path
+ * segments, query and fragment.  Afterwards userinfo, host, every path
+ * segment, query and fragment are percent-decoded with pct_decode() and
+ * the result is passed through uri_normalize().
+ *
+ * Returns the new object, or NULL when text is NULL, an allocation
+ * fails, the text does not match the grammar implemented here (any
+ * unconsumed trailing text counts as a mismatch), percent-decoding
+ * fails or uri_normalize() reports -1.  On every failure after
+ * uri_new() the partially filled object is released with uri_free().
+ * Presumably the caller releases a successful result with uri_free()
+ * as well; confirm against yuri.h.
+ *
+ * NOTE(review): a leading '/' is only consumed as the start of a path
+ * when an authority host was parsed, so inputs such as "/a/b" or
+ * "file:/a" end up in the trailing-text check and are rejected.
+ */
+struct uri *
+uri_decode(const char *text)
+{
+	struct uri *ret;
+	const char *ptr;
+	const char *cpy;
+	const char *dup;
+	int dotctr;
+	int i;
+
+	if (text == NULL)
+		return NULL;
+
+	ret = uri_new();
+	if (ret == NULL)
+		return NULL;
+
+	ptr = text;
+
+	/* look for scheme */
+	if (_is_alpha(*ptr)) {
+		cpy = ptr;
+		ptr++;
+		while (*ptr != '\0' && (_is_alpha(*ptr) || _is_digit(*ptr) || *ptr == '+' || *ptr == '-' || *ptr == '.'))
+			ptr++;
+		if (*ptr == ':') {
+			ret->scheme = strndup(cpy, ptr-cpy);
+			if (ret->scheme == NULL) {
+				uri_free(ret);
+				return NULL;
+			}
+			ptr++;
+		} else {
+			/* not found, rewind */
+			ptr = cpy;
+		}
+	}
+
+	/* there is authority */
+	if (strncmp(ptr, "//", 2) == 0) {
+		ptr += 2;
+
+		/* scan for userinfo */
+		cpy = ptr;
+		for (;;) {
+			if (_is_unreserved(*ptr) || _is_sub_delim(*ptr) || *ptr == ':')
+				ptr++;
+			else if (_is_pct_triplet(ptr))
+				ptr += 3;
+			else
+				break;
+		}
+		if (*ptr == '@') {
+			ret->authority.user = strndup(cpy, ptr-cpy);
+			if (ret->authority.user == NULL) {
+				uri_free(ret);
+				return NULL;
+			}
+			ptr++;
+		} else {
+			/* not found, reset back */
+			ptr = cpy;
+		}
+
+		/* try IP6 */
+		if (*ptr == '[') {
+			ptr++;
+			cpy = ptr;
+			while (*ptr != '\0' && (_is_digit(*ptr) || _is_alpha(*ptr) || *ptr == ':'))
+				ptr++;
+			if (*ptr != ']') {
+				uri_free(ret);
+				return NULL;
+			}
+			ret->authority.host = strndup(cpy, ptr-cpy);
+			if (ret->authority.host == NULL) {
+				uri_free(ret);
+				return NULL;
+			}
+			ret->authority.type = YURI_HOST_IP6;
+			ptr++;
+		}
+
+		/* not found? try IP4 */
+		if (ret->authority.type == 0) {
+			dotctr = 0;
+			cpy = ptr;
+			while (*ptr != '\0' && (_is_digit(*ptr) || *ptr == '.')) {
+				if (*ptr == '.')
+					dotctr++;
+				ptr++;
+			}
+			if (dotctr == 3) {
+				if (*ptr != '\0' && *ptr != ':' && *ptr != '/' && *ptr != '?' && *ptr != '#') {
+					uri_free(ret);
+					return NULL;
+				}
+				ret->authority.host = strndup(cpy, ptr-cpy);
+				if (ret->authority.host == NULL) {
+					uri_free(ret);
+					return NULL;
+				}
+				ret->authority.type = YURI_HOST_IP4;
+			} else {
+				/* not an IP4, rewind and try again */
+				ptr = cpy;
+			}
+		}
+
+		/* not found? try IPFuture (not gonna happen) */
+		if (ret->authority.type == 0) {
+			if (*ptr == 'v') {
+				if ((_is_digit(*(ptr+1)) || _is_alpha(*(ptr+1))) &&
+					(_is_digit(*(ptr+2)) || _is_alpha(*(ptr+2))) &&
+					*(ptr+3) == '.') {
+					ptr += 4;
+					cpy = ptr;
+					while (*ptr != '\0' && (_is_unreserved(*ptr) || _is_sub_delim(*ptr) || *ptr == ':'))
+						ptr++;
+					if (*ptr != '\0' && *ptr != ':' && *ptr != '/' && *ptr != '?' && *ptr != '#') {
+						uri_free(ret);
+						return NULL;
+					}
+					ret->authority.host = strndup(cpy, ptr-cpy);
+					if (ret->authority.host == NULL) {
+						uri_free(ret);
+						return NULL;
+					}
+					ret->authority.type = YURI_HOST_IPFUTURE;
+				}
+			}
+		}
+
+		/* not found? try name */
+		if (ret->authority.type == 0) {
+			/* every failed attempt above left ptr at the host start */
+			cpy = ptr;
+			for (;;) {
+				if (_is_unreserved(*ptr) || _is_sub_delim(*ptr))
+					ptr++;
+				else if (_is_pct_triplet(ptr))
+					ptr += 3;
+				else
+					break;
+			}
+			if (*ptr != '\0' && *ptr != ':' && *ptr != '/' && *ptr != '?' && *ptr != '#') {
+				uri_free(ret);
+				return NULL;
+			}
+			ret->authority.host = strndup(cpy, ptr-cpy);
+			if (ret->authority.host == NULL) {
+				uri_free(ret);
+				return NULL;
+			}
+			ret->authority.type = YURI_HOST_NAME;
+		}
+
+		/* host is set, check if there's alternative port */
+		if (ret->authority.host != NULL && *ptr == ':') {
+			ptr++;
+			cpy = ptr;
+			while (*ptr != '\0' && _is_digit(*ptr))
+				ptr++;
+			ret->authority.port = _strtoi(cpy, ptr-cpy, 10);
+			if (ret->authority.port == -1) {
+				uri_free(ret);
+				return NULL;
+			}
+		}
+	}
+
+	/* look for path; a segment may also open with a pct-encoded triplet */
+	if ((ret->authority.host && *ptr == '/') || _is_segment_nc(*ptr) || _is_pct_triplet(ptr)) {
+		do {
+			if (*ptr == '/')
+				ptr++;
+			cpy = ptr;
+			for (;;) {
+				if (ret->scheme ? _is_segment(*ptr) : _is_segment_nc(*ptr))
+					ptr++;
+				else if (_is_pct_triplet(ptr))
+					ptr += 3;
+				else
+					break;
+			}
+			if (_uri_append_path(ret, cpy, ptr-cpy) == -1) {
+				uri_free(ret);
+				return NULL;
+			}
+		} while (*ptr != '\0' && *ptr == '/');
+	}
+
+	/* look for query */
+	if (*ptr == '?') {
+		ptr++;
+		cpy = ptr;
+		for (;;) {
+			if (_is_pchar(*ptr) || *ptr == '/' || *ptr == '?')
+				ptr++;
+			else if (_is_pct_triplet(ptr))
+				ptr += 3;
+			else
+				break;
+		}
+		ret->query = strndup(cpy, ptr-cpy);
+		if (ret->query == NULL) {
+			uri_free(ret);
+			return NULL;
+		}
+	}
+
+	/* look for fragment */
+	if (*ptr == '#') {
+		ptr++;
+		cpy = ptr;
+		for (;;) {
+			if (_is_pchar(*ptr) || *ptr == '/' || *ptr == '?')
+				ptr++;
+			else if (_is_pct_triplet(ptr))
+				ptr += 3;
+			else
+				break;
+		}
+		ret->fragment = strndup(cpy, ptr-cpy);
+		if (ret->fragment == NULL) {
+			uri_free(ret);
+			return NULL;
+		}
+	}
+
+	/*
+	 * Anything left over was not matched by the grammar above (this
+	 * includes a '%' that does not start a valid triplet): fail.
+	 */
+	if (*ptr != '\0') {
+		uri_free(ret);
+		return NULL;
+	}
+
+	/* decode percent encoded characters */
+	if (ret->authority.user) {
+		dup = pct_decode(ret->authority.user);
+		if (dup == NULL) {
+			uri_free(ret);
+			return NULL;
+		}
+		free(ret->authority.user);
+		ret->authority.user = dup;
+	}
+	if (ret->authority.host) {
+		dup = pct_decode(ret->authority.host);
+		if (dup == NULL) {
+			uri_free(ret);
+			return NULL;
+		}
+		free(ret->authority.host);
+		ret->authority.host = dup;
+	}
+	if (ret->npath != 0) {
+		for (i = 0; i < ret->npath; i++) {
+			dup = pct_decode(ret->path[i]);
+			if (dup == NULL) {
+				uri_free(ret);
+				return NULL;
+			}
+			free(ret->path[i]);
+			ret->path[i] = dup;
+		}
+	}
+	if (ret->query) {
+		dup = pct_decode(ret->query);
+		if (dup == NULL) {
+			uri_free(ret);
+			return NULL;
+		}
+		free(ret->query);
+		ret->query = dup;
+	}
+	if (ret->fragment) {
+		dup = pct_decode(ret->fragment);
+		if (dup == NULL) {
+			uri_free(ret);
+			return NULL;
+		}
+		free(ret->fragment);
+		ret->fragment = dup;
+	}
+
+	if (uri_normalize(ret) == -1) {
+		uri_free(ret);
+		return NULL;
+	}
+
+	return ret;
+}