#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include "yuri.h"
/*
* for an explanation of this hell please refer to
* RFC 3986 (Appendix A)
*/
/*
 * ALPHA: non-zero when c is one of 'A'-'Z' or 'a'-'z'.
 *
 * Every letter is compared individually, so the test does not depend on
 * letters being contiguous in the execution character set.
 * The argument is parenthesised so that expressions such as `x & 0x7f`
 * are compared as a whole; it is still evaluated more than once, so do
 * not pass an expression with side effects.
 */
#define _is_alpha(c)\
	(((c) == 'A') ||\
	((c) == 'B') ||\
	((c) == 'C') ||\
	((c) == 'D') ||\
	((c) == 'E') ||\
	((c) == 'F') ||\
	((c) == 'G') ||\
	((c) == 'H') ||\
	((c) == 'I') ||\
	((c) == 'J') ||\
	((c) == 'K') ||\
	((c) == 'L') ||\
	((c) == 'M') ||\
	((c) == 'N') ||\
	((c) == 'O') ||\
	((c) == 'P') ||\
	((c) == 'Q') ||\
	((c) == 'R') ||\
	((c) == 'S') ||\
	((c) == 'T') ||\
	((c) == 'U') ||\
	((c) == 'V') ||\
	((c) == 'W') ||\
	((c) == 'X') ||\
	((c) == 'Y') ||\
	((c) == 'Z') ||\
	((c) == 'a') ||\
	((c) == 'b') ||\
	((c) == 'c') ||\
	((c) == 'd') ||\
	((c) == 'e') ||\
	((c) == 'f') ||\
	((c) == 'g') ||\
	((c) == 'h') ||\
	((c) == 'i') ||\
	((c) == 'j') ||\
	((c) == 'k') ||\
	((c) == 'l') ||\
	((c) == 'm') ||\
	((c) == 'n') ||\
	((c) == 'o') ||\
	((c) == 'p') ||\
	((c) == 'q') ||\
	((c) == 'r') ||\
	((c) == 's') ||\
	((c) == 't') ||\
	((c) == 'u') ||\
	((c) == 'v') ||\
	((c) == 'w') ||\
	((c) == 'x') ||\
	((c) == 'y') ||\
	((c) == 'z'))
/*
 * DIGIT: non-zero when c is one of '0'-'9'.
 *
 * The C standard guarantees that the decimal digits are contiguous and
 * ascending, so a range test is equivalent to comparing each digit.
 * The argument is parenthesised but evaluated twice; do not pass an
 * expression with side effects.
 */
#define _is_digit(c)\
	(((c) >= '0') && ((c) <= '9'))
/*
 * gen-delims: non-zero when c is one of  : / ? # [ ] @
 *
 * The argument is parenthesised so compound expressions compare as a
 * whole; it is evaluated more than once.
 */
#define _is_gen_delim(c)\
	(((c) == ':') ||\
	((c) == '/') ||\
	((c) == '?') ||\
	((c) == '#') ||\
	((c) == '[') ||\
	((c) == ']') ||\
	((c) == '@'))
/*
 * sub-delims: non-zero when c is one of  ! $ & ' ( ) * + , ; =
 *
 * The argument is parenthesised so compound expressions compare as a
 * whole; it is evaluated more than once.
 */
#define _is_sub_delim(c)\
	(((c) == '!') ||\
	((c) == '$') ||\
	((c) == '&') ||\
	((c) == '\'') ||\
	((c) == '(') ||\
	((c) == ')') ||\
	((c) == '*') ||\
	((c) == '+') ||\
	((c) == ',') ||\
	((c) == ';') ||\
	((c) == '='))
/*
 * unreserved: ALPHA / DIGIT / "-" / "." / "_" / "~"
 *
 * The argument is parenthesised both in the direct comparisons and when
 * it is forwarded to _is_alpha()/_is_digit(), so compound expressions
 * are handled as a whole. It is evaluated more than once.
 */
#define _is_unreserved(c)\
	(_is_alpha((c)) ||\
	_is_digit((c)) ||\
	((c) == '-') ||\
	((c) == '.') ||\
	((c) == '_') ||\
	((c) == '~'))
/*
 * reserved: gen-delims / sub-delims
 *
 * The argument is forwarded in parentheses so compound expressions are
 * handled as a whole. It is evaluated more than once.
 */
#define _is_reserved(c)\
	(_is_gen_delim((c)) ||\
	_is_sub_delim((c)))
/*
 * pchar without the pct-encoded alternative:
 * unreserved / sub-delims / ":" / "@"
 *
 * A '%' is not matched here; callers deal with pct-encoded triplets
 * themselves. The argument is parenthesised and evaluated more than once.
 */
#define _is_pchar(c)\
	(_is_unreserved((c)) ||\
	_is_sub_delim((c)) ||\
	((c) == ':') ||\
	((c) == '@'))
/*
 * Character allowed in a path segment: same set as _is_pchar().
 * The expansion and the forwarded argument are both parenthesised so
 * the macro is safe inside larger expressions.
 */
#define _is_segment(c)\
	(_is_pchar((c)))
/*
 * Character allowed in a "no-colon" path segment:
 * unreserved / sub-delims / "@"  (i.e. _is_pchar() without ':').
 *
 * A '%' is not matched here. The argument is parenthesised and
 * evaluated more than once.
 */
#define _is_segment_nc(c)\
	(_is_unreserved((c)) ||\
	_is_sub_delim((c)) ||\
	((c) == '@'))
/*
 * Parse an unsigned integer from at most the first n bytes of str,
 * interpreting them as digits in base b (2..36, letters in either case).
 *
 * Parsing stops at the first byte that is not a digit of base b, at a
 * NUL, or after n bytes, whichever comes first. Unlike strtol(), no
 * leading whitespace, sign or "0x" prefix is accepted and no temporary
 * copy of the string is allocated.
 *
 * Returns the parsed value (0 when no digit was consumed), or -1 when
 * str is NULL, n is negative, b is outside 2..36, or the value does not
 * fit in an int.
 */
static int
_strtoi(const char *str, int n, int b)
{
	static const char lower[] = "0123456789abcdefghijklmnopqrstuvwxyz";
	static const char upper[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
	const char *hit;
	int value;
	int digit;
	int i;

	if (str == NULL || n < 0 || b < 2 || b > 36)
		return -1;

	value = 0;
	for (i = 0; i < n && str[i] != '\0'; i++) {
		/* table lookup: does not assume letters are contiguous */
		hit = memchr(lower, str[i], sizeof(lower) - 1);
		if (hit != NULL) {
			digit = (int)(hit - lower);
		} else {
			hit = memchr(upper, str[i], sizeof(upper) - 1);
			if (hit == NULL)
				break;
			digit = (int)(hit - upper);
		}
		if (digit >= b)
			break;
		/* would value * b + digit exceed INT_MAX? */
		if (value > (INT_MAX - digit) / b)
			return -1;
		value = value * b + digit;
	}
	return value;
}
/*
 * Return the value (0..15) of the hexadecimal digit c, or -1 when c is
 * not a hexadecimal digit.
 */
static int
_pct_hex_value(char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

/*
 * Return a newly allocated copy of text in which every pct-encoded
 * triplet ("%" HEXDIG HEXDIG) is replaced by the byte it encodes.
 * All other bytes are copied unchanged.
 *
 * Returns NULL when text is NULL, when memory cannot be allocated, or
 * when a '%' is not followed by two hexadecimal digits. The caller owns
 * the returned string and releases it with free().
 *
 * NOTE(review): "%00" decodes to a NUL byte, so the returned C string
 * ends at that point (the bytes after it are dropped). This matches the
 * previous behaviour; consider rejecting "%00" instead.
 */
static const char *
pct_decode(const char *text)
{
	size_t len;
	size_t in;
	size_t out;
	char *buf;
	char *shrunk;
	int hi;
	int lo;

	if (text == NULL)
		return NULL;

	/* decoding never grows the string, so len + 1 bytes always suffice */
	len = strlen(text);
	buf = malloc(len + 1);
	if (buf == NULL)
		return NULL;

	in = 0;
	out = 0;
	while (in < len) {
		if (text[in] != '%') {
			buf[out++] = text[in++];
			continue;
		}
		/*
		 * text[in + 1] is at worst the terminating NUL, which is not
		 * a hex digit; text[in + 2] is only read when the first
		 * digit is valid, so no byte past the terminator is touched.
		 */
		hi = _pct_hex_value(text[in + 1]);
		lo = (hi < 0) ? -1 : _pct_hex_value(text[in + 2]);
		if (lo < 0) {
			free(buf);
			return NULL;
		}
		buf[out++] = (char)(unsigned char)((hi << 4) | lo);
		in += 3;
	}
	buf[out] = '\0';

	/* give back the unused tail; keep the larger buffer if that fails */
	shrunk = realloc(buf, strlen(buf) + 1);
	if (shrunk == NULL)
		return buf;
	return shrunk;
}
/*
 * Append a copy of the first len bytes of item as a new last element of
 * uri->path, growing the array by one slot.
 *
 * Returns 0 on success. Returns -1 when either allocation fails; in that
 * case uri->npath is left unchanged (the array itself may already have
 * been enlarged).
 */
static int
_uri_append_path(struct uri *uri, const char *item, int len)
{
	char **grown;
	int count;

	/* one more than the current number of segments */
	count = uri->npath + 1;

	grown = realloc(uri->path, sizeof(*uri->path)*count);
	if (grown == NULL)
		return -1;
	uri->path = grown;

	grown[count-1] = strndup(item, len);
	if (grown[count-1] == NULL)
		return -1;

	/* only publish the new count once the slot holds a string */
	uri->npath = count;
	return 0;
}
/*
 * Return non-zero when c is a hexadecimal digit (either letter case).
 */
static int
_uri_is_hexdig(char c)
{
	return _is_digit(c) ||
	    (c >= 'A' && c <= 'F') ||
	    (c >= 'a' && c <= 'f');
}

/*
 * Return 3 when p points at a well-formed pct-encoded triplet
 * ("%" HEXDIG HEXDIG), 0 otherwise. The checks short-circuit, so no
 * byte after the string terminator is read.
 */
static int
_uri_pct_len(const char *p)
{
	if (p[0] == '%' && _uri_is_hexdig(p[1]) && _uri_is_hexdig(p[2]))
		return 3;
	return 0;
}

/*
 * Return non-zero when c may directly follow a host: end of string or
 * one of ':' '/' '?' '#'.
 */
static int
_uri_is_host_end(char c)
{
	return c == '\0' || c == ':' || c == '/' || c == '?' || c == '#';
}

/*
 * Parse text into a struct uri obtained from uri_new().
 *
 * The components are recognised in this order: scheme, authority
 * (userinfo, host, port), path segments, query, fragment. Afterwards the
 * user, host, path segments, query and fragment are percent-decoded with
 * pct_decode() and the result is passed to uri_normalize().
 *
 * Returns the new uri on success. Returns NULL when text is NULL, when
 * an allocation fails, when text is not fully consumed by the parser
 * (including a '%' that does not start a valid pct-encoded triplet), or
 * when uri_normalize() reports -1; on every failure after uri_new()
 * succeeded the partially built uri is released with uri_free().
 *
 * NOTE(review): a path that starts with '/' is only accepted after an
 * authority, so input such as "scheme:/path" is rejected as trailing
 * text - confirm whether that is intended.
 */
struct uri *
uri_decode(const char *text)
{
	struct uri *ret;
	const char *ptr;
	const char *cpy;
	const char *dup;
	int dotctr;
	int pct;
	int i;

	if (text == NULL)
		return NULL;
	ret = uri_new();
	if (ret == NULL)
		return NULL;
	ptr = text;

	/* look for scheme */
	if (_is_alpha(*ptr)) {
		cpy = ptr;
		ptr++;
		while (*ptr != '\0' && (_is_alpha(*ptr) || _is_digit(*ptr) ||
		    *ptr == '+' || *ptr == '-' || *ptr == '.'))
			ptr++;
		if (*ptr == ':') {
			ret->scheme = strndup(cpy, ptr-cpy);
			if (ret->scheme == NULL)
				goto fail;
			ptr++;
		} else {
			/* not found, rewind */
			ptr = cpy;
		}
	}

	/* there is authority */
	if (strncmp(ptr, "//", 2) == 0) {
		ptr += 2;

		/* scan for userinfo */
		cpy = ptr;
		while (*ptr != '\0') {
			if (_is_unreserved(*ptr) || _is_sub_delim(*ptr) || *ptr == ':')
				ptr++;
			else if ((pct = _uri_pct_len(ptr)) != 0)
				ptr += pct;
			else
				break;
		}
		if (*ptr == '@') {
			ret->authority.user = strndup(cpy, ptr-cpy);
			if (ret->authority.user == NULL)
				goto fail;
			ptr++;
		} else {
			/* not found, reset back */
			ptr = cpy;
		}

		/* try IP6 */
		if (*ptr == '[') {
			ptr++;
			cpy = ptr;
			while (*ptr != '\0' && (_is_digit(*ptr) || _is_alpha(*ptr) || *ptr == ':'))
				ptr++;
			if (*ptr != ']')
				goto fail;
			ret->authority.host = strndup(cpy, ptr-cpy);
			if (ret->authority.host == NULL)
				goto fail;
			ret->authority.type = YURI_HOST_IP6;
			ptr++;
		}

		/* not found? try IP4 */
		if (ret->authority.type == 0) {
			dotctr = 0;
			cpy = ptr;
			while (*ptr != '\0' && (_is_digit(*ptr) || *ptr == '.')) {
				if (*ptr == '.')
					dotctr++;
				ptr++;
			}
			if (dotctr == 3 && _uri_is_host_end(*ptr)) {
				ret->authority.host = strndup(cpy, ptr-cpy);
				if (ret->authority.host == NULL)
					goto fail;
				ret->authority.type = YURI_HOST_IP4;
			} else {
				/*
				 * not an IP4 (this includes names with a numeric
				 * prefix such as "1.2.3.example.com"): rewind and
				 * try the remaining host forms
				 */
				ptr = cpy;
			}
		}

		/*
		 * not found? try IPFuture (not gonna happen)
		 *
		 * NOTE(review): this matches "v", two alphanumerics and "."
		 * outside of brackets and stores only the text after the
		 * dot, so a name such as "v10.example.com" is taken as
		 * IPFuture - confirm against RFC 3986 section 3.2.2.
		 */
		if (ret->authority.type == 0 && *ptr == 'v' &&
		    (_is_digit(ptr[1]) || _is_alpha(ptr[1])) &&
		    (_is_digit(ptr[2]) || _is_alpha(ptr[2])) &&
		    ptr[3] == '.') {
			ptr += 4;
			cpy = ptr;
			while (*ptr != '\0' && (_is_unreserved(*ptr) || _is_sub_delim(*ptr) || *ptr == ':'))
				ptr++;
			if (!_uri_is_host_end(*ptr))
				goto fail;
			ret->authority.host = strndup(cpy, ptr-cpy);
			if (ret->authority.host == NULL)
				goto fail;
			ret->authority.type = YURI_HOST_IPFUTURE;
		}

		/* not found? try name */
		if (ret->authority.type == 0) {
			cpy = ptr;
			while (*ptr != '\0') {
				if (_is_unreserved(*ptr) || _is_sub_delim(*ptr))
					ptr++;
				else if ((pct = _uri_pct_len(ptr)) != 0)
					ptr += pct;
				else
					break;
			}
			if (!_uri_is_host_end(*ptr))
				goto fail;
			ret->authority.host = strndup(cpy, ptr-cpy);
			if (ret->authority.host == NULL)
				goto fail;
			ret->authority.type = YURI_HOST_NAME;
		}

		/* host is set, check if there's alternative port */
		if (ret->authority.host != NULL && *ptr == ':') {
			ptr++;
			cpy = ptr;
			while (*ptr != '\0' && _is_digit(*ptr))
				ptr++;
			ret->authority.port = _strtoi(cpy, ptr-cpy, 10);
			if (ret->authority.port == -1)
				goto fail;
		}
	}

	/* look for path */
	if ((ret->authority.host && *ptr == '/') || _is_segment_nc(*ptr) ||
	    _uri_pct_len(ptr) != 0) {
		do {
			if (*ptr == '/')
				ptr++;
			cpy = ptr;
			while (*ptr != '\0') {
				if (ret->scheme ? _is_segment(*ptr) : _is_segment_nc(*ptr))
					ptr++;
				else if ((pct = _uri_pct_len(ptr)) != 0)
					ptr += pct;
				else
					break;
			}
			if (_uri_append_path(ret, cpy, ptr-cpy) == -1)
				goto fail;
		} while (*ptr == '/');
	}

	/* look for query */
	if (*ptr == '?') {
		ptr++;
		cpy = ptr;
		while (*ptr != '\0') {
			if (_is_pchar(*ptr) || *ptr == '/' || *ptr == '?')
				ptr++;
			else if ((pct = _uri_pct_len(ptr)) != 0)
				ptr += pct;
			else
				break;
		}
		ret->query = strndup(cpy, ptr-cpy);
		if (ret->query == NULL)
			goto fail;
	}

	/* look for fragment */
	if (*ptr == '#') {
		ptr++;
		cpy = ptr;
		while (*ptr != '\0') {
			if (_is_pchar(*ptr) || *ptr == '/' || *ptr == '?')
				ptr++;
			else if ((pct = _uri_pct_len(ptr)) != 0)
				ptr += pct;
			else
				break;
		}
		ret->fragment = strndup(cpy, ptr-cpy);
		if (ret->fragment == NULL)
			goto fail;
	}

	/* if there is still some trailing text, the input is invalid, fail */
	if (*ptr != '\0')
		goto fail;

	/* decode percent encoded characters */
	if (ret->authority.user) {
		dup = pct_decode(ret->authority.user);
		if (dup == NULL)
			goto fail;
		free(ret->authority.user);
		ret->authority.user = dup;
	}
	if (ret->authority.host) {
		dup = pct_decode(ret->authority.host);
		if (dup == NULL)
			goto fail;
		free(ret->authority.host);
		ret->authority.host = dup;
	}
	for (i = 0; i < ret->npath; i++) {
		dup = pct_decode(ret->path[i]);
		if (dup == NULL)
			goto fail;
		free(ret->path[i]);
		ret->path[i] = dup;
	}
	if (ret->query) {
		dup = pct_decode(ret->query);
		if (dup == NULL)
			goto fail;
		free(ret->query);
		ret->query = dup;
	}
	if (ret->fragment) {
		dup = pct_decode(ret->fragment);
		if (dup == NULL)
			goto fail;
		free(ret->fragment);
		ret->fragment = dup;
	}

	if (uri_normalize(ret) == -1)
		goto fail;
	return ret;

fail:
	/* single exit for every error after uri_new() succeeded */
	uri_free(ret);
	return NULL;
}