about summary refs log tree commit diff stats
path: root/tldlist.c
diff options
context:
space:
mode:
authorElias Norberg <xyzzy@kudzu.se>2012-01-12 00:17:33 +0100
committerElias Norberg <xyzzy@kudzu.se>2012-01-16 19:51:27 +0100
commitbc81e88b13c0ddfa385a10cfbe2ea93259c7f8a4 (patch)
tree9ef03ce4c6316b5b15484a6ca08d582c319fad3d /tldlist.c
parenta8a93c2b7cae6c1d217bca596d8b0b631efe97b7 (diff)
downloadxombrero-bc81e88b13c0ddfa385a10cfbe2ea93259c7f8a4.tar.gz
Use a list of TLD's to properly handle multi-part TLD's
The list used is from publicsuffix.org. It gets installed
to the resource-directory as 'tld-rules'
Diffstat (limited to 'tldlist.c')
-rw-r--r--tldlist.c442
1 files changed, 442 insertions, 0 deletions
diff --git a/tldlist.c b/tldlist.c
new file mode 100644
index 0000000..4485636
--- /dev/null
+++ b/tldlist.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2012 Elias Norberg <xyzzy@kudzu.se>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "xxxterm.h"
+
+#define TLD_TREE_END_NODE 1
+#define TLD_TREE_EXCEPTION 2
+
+struct tld_tree_node {
+	struct tld_tree_node	*next;
+	struct tld_tree_node	*child;
+	const char		*lbl;
+	char			flags;
+};
+
+struct tld_tree_node tld_tree_root = { NULL, NULL, "" };
+
+#define TREE_INSERT_CHILD(n, data) \
+		n->child = g_malloc(sizeof *n); \
+		n->child->next = NULL; \
+		n->child->child = NULL; \
+		n->child->flags = 0; \
+		n->child->lbl = data;
+
+#define TREE_INSERT_NEXT(n, data) \
+		n->next = g_malloc(sizeof *n); \
+		n->next->next = NULL; \
+		n->next->child = NULL; \
+		n->next->flags = 0; \
+		n->next->lbl = data;
+
+#define P_BASE		(36)
+#define P_TMIN		(1)
+#define P_TMAX		(26)
+#define P_SKEW		(38)
+#define P_DAMP		(700)
+#define INITIAL_BIAS	(72)
+#define INITIAL_N	(128)
+
+int
+adapt(int delta, int numpoints, int firsttime)
+{
+	int		k;
+
+	if (firsttime)
+		delta = delta / P_DAMP;
+	else
+		delta = delta / 2;
+
+	delta += (delta / numpoints);
+
+	k = 0;
+	while (delta > (((P_BASE - P_TMIN) * P_TMAX) / 2)) {
+		delta = delta / (P_BASE - P_TMIN);
+		k += P_BASE;
+	}
+
+	k += (((P_BASE - P_TMIN + 1) * delta) / (delta + P_SKEW));
+	return (k);
+}
+
+int
+get_minimum_char(char *str, int n)
+{
+	gunichar ch = 0;
+	gunichar min = 0xffffff;
+
+	for(; *str; str = g_utf8_next_char(str)) {
+		ch = g_utf8_get_char(str);
+		if (ch >= n && ch < min)
+			min = ch;
+	}
+
+	return (min);
+}
+
+char
+encode_digit(int n)
+{
+	if (n < 26)
+		return n + 'a';
+	return (n-26) + '0';
+}
+
+char *
+punycode_encode(char *str)
+{
+	char		output[1024];
+	char		*s;
+	gunichar	c;
+	int		need_coding = 0;
+	int		l, len, i;
+
+	int		n = INITIAL_N;
+	int		delta = 0;
+	int		bias = INITIAL_BIAS;
+	int		h, b, m, k, t, q;
+
+	l = 0;
+	for (s=str; *s; s = g_utf8_next_char(s)) {
+		c = g_utf8_get_char(s);
+		if (c < 128)
+			output[l++] = *s;
+		else
+			need_coding = 1;
+	}
+
+	output[l] = '\0';
+
+	if (!need_coding)
+		return g_strdup(output);
+
+	h = b = strlen(output);
+
+	if (l > 0)
+		output[l++] = '-';
+	output[l] = '\0';
+
+	len = g_utf8_strlen(str, -1);
+	while (h < len) {
+		m = get_minimum_char(str, n);
+		delta += (m - n) * (h + 1);
+		n = m;
+		for (s=str; *s; s = g_utf8_next_char(s)) {
+			c = g_utf8_get_char(s);
+
+			if (c < n) delta ++;
+			if (c == n) {
+				q = delta;
+				for (k=P_BASE;; k+=P_BASE) {
+					if (k <= bias)
+						t = P_TMIN;
+					else if(k >= bias + P_TMAX)
+						t = P_TMAX;
+					else
+						t = k - bias;
+
+					if (q < t)
+						break;
+
+					output[l++] = encode_digit(t+((q-t)%(P_BASE-t)));
+					q = (q - t) / (P_BASE - t);
+				}
+				output[l++] = encode_digit(q);
+				bias = adapt(delta, h + 1, h == b);
+				delta = 0;
+				h ++;
+			}
+		}
+		delta ++;
+		n ++;
+	}
+
+	output[l] = '\0';
+	for (i=l+4;i>=4;i--)
+		output[i] = output[i-4];
+	l += 4;
+	output[0] = 'x';
+	output[1] = 'n';
+	output[2] = '-';
+	output[3] = '-';
+	output[l] = '\0';
+	return g_strdup(output);
+}
+
+/*
+ * strrchr2(str, saveptr, ch)
+ *
+ * Walk backwards through str, jumping to next 'ch'
+ * On first call, *saveptr should be set to NULL.
+ * On following calls, supply the same saveptr.
+ *
+ * Returns NULL when the whole string 'str' has been
+ * looped through. Otherwise returns the position
+ * before the next 'ch'.
+ */
+const char *
+strrchr2(const char *str, const char **saveptr, int ch)
+{
+	const char		*ptr;
+
+	if (str != NULL && *saveptr == NULL) {
+		*saveptr = str + strlen(str);
+	} else if (str == *saveptr) {
+		return (NULL);
+	}
+
+	for (ptr= *saveptr - 1; ptr != str && *ptr != ch; ptr--)
+		;
+
+	*saveptr = ptr;
+	if (ptr != str)
+		return (ptr+1);
+	return (ptr);
+}
+
+/*
+ * tld_tree_add(rule)
+ *
+ * Adds a tld-rule to the tree
+ */
+void
+tld_tree_add(const char *rule)
+{
+	struct tld_tree_node	*n;
+	const char		*lbl;
+	const char		*saveptr;
+
+	saveptr = NULL;
+	lbl = strrchr2(rule, &saveptr, '.');
+
+	for (n = &tld_tree_root; lbl != NULL;) {
+
+		if (strcmp(n->lbl, lbl) == 0) {
+			lbl = strrchr2(rule, &saveptr, '.');
+
+			if (!n->child)
+				break;
+
+			n = n->child;
+			continue;
+		}
+
+		if (n->next == NULL) {
+			TREE_INSERT_NEXT(n, lbl);
+			n = n->next;
+
+			lbl = strrchr2(rule, &saveptr, '.');
+			break;
+		}
+		n = n->next;
+	}
+
+	while (lbl) {
+		TREE_INSERT_CHILD(n, lbl);
+
+		lbl = strrchr2(rule, &saveptr, '.');
+		n = n->child;
+	}
+
+	n->flags |= TLD_TREE_END_NODE;
+	if (n->lbl[0] == '!') {
+		n->flags |= TLD_TREE_EXCEPTION;
+		n->lbl ++;
+	}
+}
+
+void
+tld_tree_init()
+{
+	FILE		*fd;
+	char		buf[1024];
+	char		file[PATH_MAX];
+	char		*ptr, *next_lbl;
+	char		*enc_lbl;
+	char		*rule, *rp;
+	char		extra_ch;
+
+	snprintf(file, sizeof file, "%s/tld-rules", resource_dir);
+	fd = fopen(file, "r");
+	if (fd == NULL) {
+		/* a poor replacement for the real list - but it's
+		 * better than nothing.
+		 */
+		tld_tree_add("*");
+		return;
+	}
+
+	for (;;) {
+		ptr = fgets(buf, sizeof buf, fd);
+		if (ptr == NULL || feof(fd))
+			break;
+
+		/* skip comments */
+		if ((ptr = strstr(buf, "//")) != NULL)
+			*ptr = '\0';
+		/* skip anything after space or tab */
+		for (ptr = buf; *ptr; ptr++)
+			if (*ptr == ' ' || *ptr == '\t' ||
+			    *ptr == '\n' || *ptr == '\r')
+				break;
+		*ptr = '\0';
+
+		if (!strlen(buf))
+			continue;
+
+		extra_ch = 0;
+		ptr = buf;
+		if (buf[0] == '!' && buf[0] == '*') {
+			extra_ch = buf[0];
+			ptr ++;
+		}
+
+
+		rule = NULL;
+		/* split into labels, and convert them one by one */
+		for (;;) {
+			if ((next_lbl = strchr(ptr, '.')))
+				*next_lbl = '\0';
+
+			enc_lbl = punycode_encode(ptr);
+			if (rule) {
+				rp = rule;
+				rule = g_strdup_printf("%s%s%s", rp, enc_lbl,
+				    next_lbl ? "." : "");
+				g_free(rp);
+				g_free(enc_lbl);
+			} else {
+				rule = g_strdup_printf("%.1s%s%s",
+				    extra_ch ? buf:"", enc_lbl,
+				    next_lbl ? ".":"");
+				g_free(enc_lbl);
+			}
+
+			if (!next_lbl)
+				break;
+			ptr = next_lbl + 1;
+		}
+		tld_tree_add(rule);
+	}
+
+	fclose(fd);
+}
+
+/*
+ * tld_get_suffix(domain)
+ *
+ * Find the public suffix for domain.
+ *
+ * Returns a pointer to the suffix position
+ * in domain, or NULL if no public suffix
+ * was found.
+ */
+char *
+tld_get_suffix(const char *domain)
+{
+	struct tld_tree_node	*n;
+	const char		*suffix;
+	const char		*lbl, *saveptr;
+	const char		*tmp_saveptr, *tmp_lbl;
+
+	if (domain == NULL)
+		return (NULL);
+	if (domain[0] == '.')
+		return (NULL);
+
+	saveptr = NULL;
+	suffix = NULL;
+	lbl = strrchr2(domain, &saveptr, '.');
+
+	for (n = &tld_tree_root; n != NULL && lbl != NULL;) {
+
+		if (!strlen(n->lbl)) {
+			n = n->next;
+			continue;
+		}
+
+		if (n->lbl[0] == '*') {
+			if (n->flags & TLD_TREE_END_NODE) {
+
+				tmp_saveptr = saveptr;
+				tmp_lbl = lbl;
+
+				lbl = strrchr2(domain, &saveptr, '.');
+
+				/* Save possible public suffix */
+				suffix = lbl;
+				saveptr = tmp_saveptr;
+				lbl = tmp_lbl;
+			}
+
+			n = n->next;
+			continue;
+		}
+
+		if (strcmp(n->lbl, lbl) == 0) {
+			if (n->flags & TLD_TREE_EXCEPTION) {
+				/* We're done looking */
+				suffix = lbl;
+				break;
+			}
+
+			lbl = strrchr2(domain, &saveptr, '.');
+
+			/* Possible public suffix - other rules might
+			 * still apply
+			 */
+			if (n->flags & TLD_TREE_END_NODE)
+				suffix = lbl;
+
+			/* Domain too short */
+			if (lbl == NULL) {
+				/* Check if we have a child that is '*' */
+				for (n = n->child; n; n = n->next)
+					if (n->lbl[0] == '*')
+						suffix = NULL;
+				break;
+			}
+
+			if (n->child == NULL)
+				break;
+
+			n = n->child;
+			continue;
+		}
+
+		if (n->next == NULL)
+			break;
+		n = n->next;
+	}
+
+	/* If we can't find a matching suffix, it can mean that either
+	 * a) the user is surfing a local prefix
+	 * b) the list is not properly updated
+	 *
+	 * In any case - in order not to break stuff while surfing
+	 * new TLD's, we return the public suffix as the top 2 labels
+	 *
+	 * www.abc.xyz therefore has public suffix 'abc.xyz'
+	 */
+	if (!suffix) {
+		saveptr = NULL;
+		lbl = strrchr2(domain, &saveptr, '.');
+		lbl = strrchr2(domain, &saveptr, '.');
+		suffix = lbl;
+	}
+
+	return ((char*)suffix);
+}