summary refs log tree commit diff stats
path: root/hyp.c
diff options
context:
space:
mode:
author kaa <kaa@laptosh.my.domain> 2023-06-23 11:56:56 -0700
committer kaa <kaa@laptosh.my.domain> 2023-06-23 11:56:56 -0700
commit 09819bcd940492c8ccc48284880f8bc652a2845a (patch)
tree 89f5425d6907b111afd74142866440fba9bced0c /hyp.c
download hyp-09819bcd940492c8ccc48284880f8bc652a2845a.tar.gz
Initial. 1.0
Diffstat (limited to 'hyp.c')
-rw-r--r-- hyp.c 262
1 files changed, 262 insertions, 0 deletions
diff --git a/hyp.c b/hyp.c
new file mode 100644
index 0000000..697fed5
--- /dev/null
+++ b/hyp.c
@@ -0,0 +1,262 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <hyphen.h>
+#include "config.h"
+
+/* Read a tag name from `in' into `tag' and return its length.
+Every character consumed, including the closing '>', is echoed to
+`out'. The name ends at the first space or '>'; whatever follows a
+space is echoed but not stored. At most MAXWLEN - 1 characters are
+stored, since every caller passes a buffer of MAXWLEN bytes; a longer
+name is still echoed in full but is truncated in `tag'. `tag' is
+always NUL-terminated. Returns -1 if the input ends before '>'. */
+int
+readtag(char *tag, FILE *in, FILE *out)
+{
+	/* fgetc() returns an int so that EOF stays distinct from
+	every valid byte; a char cannot keep that distinction. */
+	int ch;
+	int i = 0;
+
+	/* Data after a space in a tag is irrelevant. */
+	while ((ch = fgetc(in)) != EOF && ch != '>' && ch != ' ') {
+		fputc(ch, out);
+		if (i < MAXWLEN - 1) {
+			tag[i] = (char)ch;
+			++i;
+		}
+	}
+	tag[i] = '\0';
+
+	/* Seek to the end of the tag. This only does work when the
+	name was ended by a space. */
+	while (ch != EOF && ch != '>') {
+		fputc(ch, out);
+		ch = fgetc(in);
+	}
+	if (ch == EOF)
+		return -1;
+
+	fputc(ch, out);
+	return i;
+}
+
+/* Read the tag at the current position of `in', echoing it to `out',
+and compare its name with `tag2'. Returns 0 when the names are equal
+and non-zero otherwise, including when the input ends before the tag
+is closed. */
+int
+cmptag(char *tag2, FILE *in, FILE *out)
+{
+	char tag1[MAXWLEN];
+
+	/* readtag() reports -1 when the input ends inside the tag;
+	tag1 cannot be relied upon then, so it is not compared. */
+	if (readtag(tag1, in, out) < 0)
+		return 1;
+
+	return strcmp(tag1, tag2);
+}
+
+/* Look `tag' up in taglist. Returns the index of the first of the
+leading `tagamt' entries equal to it, or -1 when none is.
+`in' and `out' are not used. */
+int
+checktag(char *tag, int tagamt, FILE *in, FILE *out)
+{
+	int idx = 0;
+
+	while (idx < tagamt) {
+		if (!strcmp(taglist[idx], tag))
+			return idx;
+		++idx;
+	}
+
+	return -1;
+}
+
+/* Report whether `ch' occurs among the first `skiplen' characters
+of skip: 1 when it does, 0 when it does not. */
+int
+checkskip(char ch, int skiplen)
+{
+	int found = 0;
+	int k = 0;
+
+	while (!found && k < skiplen) {
+		if (skip[k] == ch)
+			found = 1;
+		++k;
+	}
+
+	return found;
+}
+
+const char *punct = "';.,\"!?:";
+/* Return 1 when `ch' is one of the characters listed in punct,
+0 otherwise. The terminating NUL never counts as a match. */
+int
+checkpunct(char ch)
+{
+	const char *p = punct;
+
+	while (*p != '\0') {
+		if (*p == ch)
+			return 1;
+		++p;
+	}
+
+	return 0;
+}
+
+const char *blank = " \n\r\t";
+/* Return 1 when `ch' is one of the characters listed in blank,
+0 otherwise. */
+int
+checkblank(char ch)
+{
+	/* strchr() would match the terminating NUL of blank, which
+	must not be reported as a blank, so rule it out first. */
+	if (ch == '\0')
+		return 0;
+
+	return strchr(blank, ch) != NULL;
+}
+
+/* Copy `in' to `out' until a tag named "body" has been read.
+Returns 1 as soon as that tag has been consumed and echoed, or 0
+if the input ends first. */
+int
+findbody(FILE *in, FILE *out)
+{
+	/* Must be an int: stored in a char, EOF is either mistaken
+	for a data byte or never recognised at all, depending on
+	whether char is signed. */
+	int ch;
+
+	while ((ch = fgetc(in)) != EOF) {
+		fputc(ch, out);
+		if (ch == '<'
+		&& cmptag("body", in, out) == 0)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Hyphenate a word, by means of hyphen library.
+This is done so as to leverage sufficient hyphenation
+patterns, with the ones used here having been taken
+from those developed for TeX.
+
+`word' holds `len' characters and is NUL-terminated. Words shorter
+than MINWLEN are written to `out' unchanged. Otherwise the word is
+written with "&shy;" at each hyphenation point the library marks
+with '=', except points that fall within the first two characters.
+If memory cannot be obtained or the library reports an error, the
+word is written unchanged rather than being lost. `in' is not used. */
+void
+hypword(char *word, int len, FILE *in, FILE *out, HyphenDict *dict)
+{
+	if (len < MINWLEN) {
+		fputs(word, out);
+		return;
+	}
+
+	char *hyphens = calloc(len + 6, sizeof(char));
+	char *hyphword = calloc(len << 1, sizeof(char));
+	char **rep = NULL;
+	int *pos = NULL, *cut = NULL;
+	int i;
+
+	/* The library is only called once both buffers exist; it
+	returns non-zero on failure. */
+	int ok = hyphens != NULL && hyphword != NULL
+		&& hnj_hyphen_hyphenate2(dict, word, len, hyphens,
+			hyphword, &rep, &pos, &cut) == 0;
+
+	if (!ok) {
+		fputs(word, out);
+	} else {
+		/* Process the given hyphenation. */
+		size_t k, n = strlen(hyphword);
+		char oldch = ' ', oldoldch = '\0';
+		for (k = 0; k < n; ++k) {
+			if (hyphword[k] == '=' && oldch != ' '
+				&& oldoldch != ' ')
+				fputs("&shy;", out);
+			else if (hyphword[k] != '=')
+				fputc(hyphword[k], out);
+
+			oldoldch = oldch;
+			oldch = hyphword[k];
+		}
+	}
+
+	/* The library may allocate rep, pos and cut for dictionaries
+	with non-standard hyphenation; they belong to the caller. */
+	if (rep != NULL) {
+		for (i = 0; i < len; ++i)
+			free(rep[i]);
+		free(rep);
+	}
+	free(pos);
+	free(cut);
+	free(hyphens);
+	free(hyphword);
+}
+
+/* Hyphenate the words within a tag.
+Reads `in' from just after an opening tag named `tag' and writes
+the text to `out', passing each word through hypword(). Words are
+ended by blanks, punctuation, characters listed in skip, and '<'.
+Processing stops once a tag named "/" followed by `tag' has been
+read, or when the input ends; a word still pending at the end of
+the input is written out rather than dropped. */
+void
+hyptag(FILE *in, FILE *out, int skiplen, char *tag, HyphenDict *dict)
+{
+	char word[MAXWLEN], term[MAXWLEN];
+	int raw, i = 0;
+
+	/* Name of the closing tag. snprintf() truncates where
+	strcat() would write past the end of term for a tag name
+	of MAXWLEN - 1 characters. */
+	snprintf(term, sizeof term, "/%s", tag);
+
+	/* fgetc() is read into an int so that EOF is told apart from
+	data; the value is narrowed to char only once it is known
+	to be a character. */
+	while ((raw = fgetc(in)) != EOF) {
+		char ch = (char)raw;
+
+		/* A negative i counts characters still to be copied
+		verbatim after a skip character. */
+		if (i < 0) {
+			fputc(ch, out);
+			++i;
+			if (checkskip(ch, skiplen)) i -= 3;
+			continue;
+		}
+		/* A blank ends the current word. There is no `else'
+		below, so the blank itself falls through and becomes
+		the first character of the next word. */
+		if (checkblank(ch)) {
+			word[i] = '\0';
+			hypword(word, i, in, out, dict);
+			i = 0;
+		}
+		if (checkpunct(ch)) {
+			word[i] = '\0';
+			hypword(word, i, in, out, dict);
+			fputc(ch, out);
+			i = 0;
+		}
+		else if (checkskip(ch, skiplen)) {
+			word[i] = '\0';
+			fputs(word, out);
+			fputc(ch, out);
+			/* A simple way of working around
+			HTML character codes. Each is 5 ( epsiv )
+			 or 6 ( hellip ) characters long, plus '&' and ';'. */
+			i = -3;
+		}
+		/* Check for closing tag. */
+		else if (ch == '<') {
+			word[i] = ch;
+			++i;
+			word[i] = '\0';
+			hypword(word, i, in, out, dict);
+			i = 0;
+			/* Input ended inside the tag: nothing is left
+			to process and word holds no usable name. */
+			if (readtag(word, in, out) < 0) break;
+			if (strcmp(word, term) == 0) break;
+		}
+		else {
+			word[i] = ch;
+			++i;
+		}
+		/* Flush before the buffer is full, leaving room for
+		one more character and the terminator. */
+		if (i == MAXWLEN - 1) {
+			word[i] = '\0';
+			hypword(word, i, in, out, dict);
+			i = 0;
+		}
+	}
+
+	/* Input ended with a word still buffered. */
+	if (i > 0) {
+		word[i] = '\0';
+		hypword(word, i, in, out, dict);
+	}
+}
+
+/* Hyphenate HTML input via `&shy;'.
+hyp [in] [out]
+Reads `in' (standard input when absent) and writes `out' (standard
+output when absent). Everything up to and including the body tag is
+copied unchanged; after it, the text inside every tag named in
+taglist is hyphenated. Diagnostics go to standard error so that they
+cannot end up in the generated document.
+Exit status: 0 success, 1 input not openable, 2 output not openable,
+3 no body tag, 4 dictionary not loadable, 5 writing the output failed. */
+int
+main(int argc, char **argv)
+{
+	FILE *in;
+	if (argc < 2)
+		in = stdin;
+	else {
+		in = fopen(argv[1], "r");
+		if (in == NULL) {
+			fprintf(stderr, "%s %s\n", argv[1], "inaccessible.");
+			return 1;
+		}
+	}
+
+	FILE *out;
+	if (argc < 3)
+		out = stdout;
+	else {
+		out = fopen(argv[2], "w");
+		if (out == NULL) {
+			fprintf(stderr, "%s %s\n", argv[2], "inaccessible.");
+			return 2;
+		}
+	}
+
+	if (findbody(in, out) == 0) {
+		fputs("There is no body.\n", stderr);
+		return 3;
+	}
+
+	HyphenDict *dict = hnj_hyphen_load(dictfile);
+	if (dict == NULL) {
+		fputs("Dict not readable.\n", stderr);
+		return 4;
+	}
+	dict->utf8 = 1;
+
+	/* taglist is terminated by an empty string. */
+	int tagamt = 0;
+	while (taglist[tagamt][0] != '\0')
+		++tagamt;
+	int skiplen = strlen(skip);
+
+	/* ch is an int so that EOF is told apart from data bytes. */
+	int ch;
+	char tag[MAXWLEN];
+	while ((ch = fgetc(in)) != EOF) {
+		fputc(ch, out);
+		if (ch == '<' && readtag(tag, in, out) > 0
+			&& checktag(tag, tagamt, in, out) != -1)
+			hyptag(in, out, skiplen, tag, dict);
+	}
+
+	hnj_hyphen_free(dict);
+	if (in != stdin)
+		fclose(in);
+
+	/* Buffered output is only known to be written once it has
+	been flushed, so check here instead of exiting silently. */
+	int failed = ferror(out) != 0;
+	if (out != stdout) {
+		if (fclose(out) != 0)
+			failed = 1;
+	}
+	else if (fflush(out) != 0)
+		failed = 1;
+	if (failed) {
+		fputs("Output not writable.\n", stderr);
+		return 5;
+	}
+
+	return 0;
+}