#include #include #include "yuri.h" /* * for an explanation of this hell please refer to * RFC 3986 (Appendix A) */ #define _is_alpha(c)\ ((c == 'A') ||\ (c == 'B') ||\ (c == 'C') ||\ (c == 'D') ||\ (c == 'E') ||\ (c == 'F') ||\ (c == 'G') ||\ (c == 'H') ||\ (c == 'I') ||\ (c == 'J') ||\ (c == 'K') ||\ (c == 'L') ||\ (c == 'M') ||\ (c == 'N') ||\ (c == 'O') ||\ (c == 'P') ||\ (c == 'Q') ||\ (c == 'R') ||\ (c == 'S') ||\ (c == 'T') ||\ (c == 'U') ||\ (c == 'V') ||\ (c == 'W') ||\ (c == 'X') ||\ (c == 'Y') ||\ (c == 'Z') ||\ (c == 'a') ||\ (c == 'b') ||\ (c == 'c') ||\ (c == 'd') ||\ (c == 'e') ||\ (c == 'f') ||\ (c == 'g') ||\ (c == 'h') ||\ (c == 'i') ||\ (c == 'j') ||\ (c == 'k') ||\ (c == 'l') ||\ (c == 'm') ||\ (c == 'n') ||\ (c == 'o') ||\ (c == 'p') ||\ (c == 'q') ||\ (c == 'r') ||\ (c == 's') ||\ (c == 't') ||\ (c == 'u') ||\ (c == 'v') ||\ (c == 'w') ||\ (c == 'x') ||\ (c == 'y') ||\ (c == 'z')) #define _is_digit(c)\ ((c == '0') ||\ (c == '1') ||\ (c == '2') ||\ (c == '3') ||\ (c == '4') ||\ (c == '5') ||\ (c == '6') ||\ (c == '7') ||\ (c == '8') ||\ (c == '9')) #define _is_gen_delim(c)\ ((c == ':') ||\ (c == '/') ||\ (c == '?') ||\ (c == '#') ||\ (c == '[') ||\ (c == ']') ||\ (c == '@')) #define _is_sub_delim(c)\ ((c == '!') ||\ (c == '$') ||\ (c == '&') ||\ (c == '\'') ||\ (c == '(') ||\ (c == ')') ||\ (c == '*') ||\ (c == '+') ||\ (c == ',') ||\ (c == ';') ||\ (c == '=')) #define _is_unreserved(c)\ (_is_alpha(c) ||\ _is_digit(c) ||\ (c == '-') ||\ (c == '.') ||\ (c == '_') ||\ (c == '~')) #define _is_reserved(c)\ (_is_gen_delim(c) ||\ _is_sub_delim(c)) #define _is_pchar(c)\ (_is_unreserved(c) ||\ _is_sub_delim(c) ||\ (c == ':') ||\ (c == '@')) #define _is_segment(c)\ _is_pchar(c) #define _is_segment_nc(c)\ (_is_unreserved(c) ||\ _is_sub_delim(c) ||\ (c == '@')) /* * TODO: I have written code for conversion to and * from strings to abstract numbers representation * that allows base conversion directly to any base * efficiently, I could not find where I kept it, * when I do this hack will be replaced with proper * function. * * This is here because we want to make sure that * when parsing numbers from strings to specify how * long that number is represented in the string, it * uses dynamic memory allocation for the temporary * string which is inefficient. */ static int _strtoi(const char *str, int n, int b) { int ret; char *strbuf; strbuf = strndup(str, n); if (strbuf == NULL) return -1; ret = strtol(strbuf, NULL, b); free(strbuf); return ret; } /* * This function could be optimized too. */ static const char * pct_decode(const char *text) { int i, x; int buflen; char *buf; char *reallocbuf; if (text == NULL) return NULL; buflen = strlen(text)+1; buf = strdup(text); if (buf == NULL) return NULL; x = 0; i = 0; while (i < buflen) { if (text[i] == '%') { i++; buf[x] = _strtoi(text+i, 2, 16); if (buf[x] == -1) { free(buf); return NULL; } i += 2; x++; continue; } buf[x] = text[i]; i++; x++; } reallocbuf = realloc(buf, strlen(buf)+1); if (reallocbuf == NULL) { free(buf); return NULL; } return reallocbuf; } static int _uri_append_path(struct uri *uri, const char *item, int len) { char **path; int npath; if (uri->npath == 0) npath = 1; else npath = uri->npath + 1; path = realloc(uri->path, sizeof(*uri->path)*npath); if (path == NULL) return -1; uri->path = path; uri->path[npath-1] = strndup(item, len); if (uri->path[npath-1] == NULL) return -1; uri->npath = npath; return 0; } struct uri * uri_decode(const char *text) { struct uri *ret; const char *ptr; const char *cpy; const char *dup; int dotctr; int i; ret = uri_new(); if (ret == NULL) return NULL; ptr = text; /* look for scheme */ if (_is_alpha(*ptr)) { cpy = ptr; ptr++; while (*ptr != '\0' && (_is_alpha(*ptr) || _is_digit(*ptr) || *ptr == '+' || *ptr == '-' || *ptr == '.')) ptr++; if (*ptr == ':') { ret->scheme = strndup(cpy, ptr-cpy); if (ret->scheme == NULL) { free(ret); return NULL; } ptr++; } else { /* not found, rewind */ ptr = cpy; } } /* there is authority */ if (strncmp(ptr, "//", 2) == 0) { ptr += 2; /* scan for userinfo */ cpy = ptr; while (*ptr != '\0' && (_is_unreserved(*ptr) || _is_sub_delim(*ptr) || *ptr == ':')) { ptr++; /* skip pct-encoded */ if (*ptr == '%') ptr += 2; } if (*ptr == '@') { ret->authority.user = strndup(cpy, ptr-cpy); if (ret->authority.user == NULL) { uri_free(ret); return NULL; } ptr++; } else { /* not found, reset back */ ptr = cpy; } /* try IP6 */ if (*ptr == '[') { ptr++; cpy = ptr; while (*ptr != '\0' && (_is_digit(*ptr) || _is_alpha(*ptr) || *ptr == ':')) ptr++; if (*ptr != ']') { uri_free(ret); return NULL; } ret->authority.host = strndup(cpy, ptr-cpy); if (ret->authority.host == NULL) { uri_free(ret); return NULL; } ret->authority.type = YURI_HOST_IP6; ptr++; } /* not found? try IP4 */ if (ret->authority.type == 0) { dotctr = 0; cpy = ptr; while (*ptr != '\0' && (_is_digit(*ptr) || *ptr == '.')) { if (*ptr == '.') dotctr++; ptr++; } if (dotctr == 3) { if (*ptr != '\0' && *ptr != ':' && *ptr != '/' && *ptr != '?' && *ptr != '#') { uri_free(ret); return NULL; } ret->authority.host = strndup(cpy, ptr-cpy); if (ret->authority.host == NULL) { uri_free(ret); return NULL; } ret->authority.type = YURI_HOST_IP4; } else { /* not and IP4 rewind and try again */ ptr = cpy; } } /* not found? try IPFuture (not gonna happen) */ if (ret->authority.type == 0) { if (*ptr == 'v') { if ((_is_digit(*(ptr+1)) || _is_alpha(*(ptr+1))) && (_is_digit(*(ptr+2)) || _is_alpha(*(ptr+2))) && *(ptr+3) == '.') { ptr += 4; cpy = ptr; while (*ptr != '\0' && (_is_unreserved(*ptr) || _is_sub_delim(*ptr) || *ptr == ':')) ptr++; if (*ptr != '\0' && *ptr != ':' && *ptr != '/' && *ptr != '?' && *ptr != '#') { uri_free(ret); return NULL; } ret->authority.host = strndup(cpy, ptr-cpy); if (ret->authority.host == NULL) { uri_free(ret); return NULL; } ret->authority.type = YURI_HOST_IPFUTURE; } } } /* not found? try name */ if (ret->authority.type == 0) { while (*ptr != '\0' && (_is_unreserved(*ptr) || _is_sub_delim(*ptr))) { ptr++; /* skip pct-encoded */ if (*ptr == '%') ptr += 2; } if (*ptr != '\0' && *ptr != ':' && *ptr != '/' && *ptr != '?' && *ptr != '#') { uri_free(ret); return NULL; } ret->authority.host = strndup(cpy, ptr-cpy); if (ret->authority.host == NULL) { uri_free(ret); return NULL; } ret->authority.type = YURI_HOST_NAME; } /* host is set, check if there's alternative port */ if (ret->authority.host != 0 && *ptr == ':') { ptr++; cpy = ptr; while (*ptr != '\0' && _is_digit(*ptr)) ptr++; ret->authority.port = _strtoi(cpy, ptr-cpy, 10); if (ret->authority.port == -1) { uri_free(ret); return NULL; } } } /* look for path */ if ((ret->authority.host && *ptr == '/') || _is_segment_nc(*ptr)) { do { if (*ptr == '/') ptr++; cpy = ptr; while (*ptr != '\0' && (ret->scheme ? _is_segment(*ptr) : _is_segment_nc(*ptr))) { ptr++; /* skip pct-encoded */ if (*ptr == '%') ptr += 2; } if (_uri_append_path(ret, cpy, ptr-cpy) == -1) { uri_free(ret); return NULL; } } while (*ptr != '\0' && *ptr == '/'); } /* look for query */ if (*ptr == '?') { ptr++; cpy = ptr; while (*ptr != '\0' && (_is_pchar(*ptr) || *ptr == '/' || *ptr == '?')) { ptr++; /* skip pct-encoded */ if (*ptr == '%') ptr += 2; } ret->query = strndup(cpy, ptr-cpy); if (ret->query == NULL) { uri_free(ret); return NULL; } } /* look for fragment */ if (*ptr == '#') { ptr++; cpy = ptr; while (*ptr != '\0' && (_is_pchar(*ptr) || *ptr == '/' || *ptr == '?')) { ptr++; /* skip pct-encoded */ if (*ptr == '%') ptr += 2; } ret->fragment = strndup(cpy, ptr-cpy); if (ret->fragment == NULL) { uri_free(ret); return NULL; } } /* if there is still some trailing text, this is a bug, fail */ if (*ptr != '\0') { uri_free(ret); return NULL; } /* decode percent encoded characters */ if (ret->authority.user) { dup = pct_decode(ret->authority.user); if (dup == NULL) { uri_free(ret); return NULL; } free(ret->authority.user); ret->authority.user = dup; } if (ret->authority.host) { dup = pct_decode(ret->authority.host); if (dup == NULL) { uri_free(ret); return NULL; } free(ret->authority.host); ret->authority.host = dup; } if (ret->npath != 0) { for (i = 0; i < ret->npath; i++) { dup = pct_decode(ret->path[i]); if (dup == NULL) { uri_free(ret); return NULL; } free(ret->path[i]); ret->path[i] = dup; } } if (ret->query) { dup = pct_decode(ret->query); if (dup == NULL) { uri_free(ret); return NULL; } free(ret->query); ret->query = dup; } if (ret->fragment) { dup = pct_decode(ret->fragment); if (dup == NULL) { uri_free(ret); return NULL; } free(ret->fragment); ret->fragment = dup; } if (uri_normalize(ret) == -1) { uri_free(ret); return NULL; } return ret; }