summary refs log blame commit diff stats
path: root/hyp.c
blob: 697fed5b27ae728a2e9234047d00f39891713e7a (plain) (tree)





































































































































































































































































                                                                              
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <hyphen.h>
#include "config.h"

/* Read a tag name from `in' into `tag' and return its length.

Reading starts with the first character after the opening '<'. The name
ends at the first space or '>'. Everything consumed, including any
attribute text between a space and the closing '>', and the '>' itself,
is echoed to `out'; only the name is stored.

Returns -1 if the input ends before the closing '>' is seen. `tag' is
NUL-terminated in every case.

NOTE(review): the size of `tag' is not passed in, so the name cannot be
bounded here; the caller has to supply a buffer that is large enough for
the longest tag name in the input. */
int
readtag(char *tag, FILE *in, FILE *out)
{
	/* fgetc() returns an int so that EOF can be told apart from every
	valid byte; a char cannot hold that distinction. */
	int ch;
	int i = 0;

	/* Data after a space in a tag is irrelevant. */
	while ((ch = fgetc(in)) != EOF && ch != '>' && ch != ' ') {
		fputc(ch, out);
		tag[i] = (char)ch;
		++i;
	}
	tag[i] = '\0';

	/* Seek to the end of the tag, echoing whatever is skipped.
	This is a no-op when the name was ended by '>' directly. */
	while (ch != '>' && ch != EOF) {
		fputc(ch, out);
		ch = fgetc(in);
	}
	if (ch == EOF)
		return -1;

	fputc(ch, out);
	return i;
}

/* Read the next tag from `in' with readtag(), which echoes it to `out',
and compare its name with `tag2'.

Returns 0 when the names are equal and non-zero otherwise. A tag that is
cut short by the end of the input counts as a mismatch. */
int
cmptag(char *tag2, FILE *in, FILE *out)
{
	/* Zero-filled, so the buffer holds a valid string even if
	readtag() gives up before terminating it. */
	char tag1[MAXWLEN] = "";

	if (readtag(tag1, in, out) < 0)
		return -1;

	return strcmp(tag1, tag2);
}

/* Look `tag' up among the first `tagamt' entries of taglist.
Returns the index of the first entry equal to `tag', or -1 when no entry
matches. `in' and `out' are not used. */
int
checktag(char *tag, int tagamt, FILE *in, FILE *out)
{
	int idx = 0;

	while (idx < tagamt) {
		if (!strcmp(tag, taglist[idx]))
			return idx;
		idx++;
	}

	return -1;
}

/* Check if a character should be skipped. */
int
checkskip(char ch, int skiplen)
{
	int i;
	for (i = 0; i < skiplen; ++i)
		if (ch == skip[i])
			return 1;

	return 0;
}

const char *punct = "';.,\"!?:";
/* Return 1 when `ch' is one of the characters listed in punct,
and 0 otherwise. */
int
checkpunct(char ch)
{
	/* strchr() also matches the terminating NUL, which does not
	count as punctuation, so rule it out first. */
	if (ch == '\0')
		return 0;

	return strchr(punct, ch) != NULL;
}

const char *blank = " \n\r\t";
/* Return 1 when `ch' is one of the characters listed in blank
(space, newline, carriage return, tab), and 0 otherwise. */
int
checkblank(char ch)
{
	const char *p;

	for (p = blank; *p != '\0'; ++p) {
		if (*p == ch)
			return 1;
	}

	return 0;
}

/* Loop until the body is found.

Copies `in' to `out' character by character. Every '<' is followed by a
cmptag() call, which reads and echoes the tag that starts there.
Returns 1 as soon as a tag named "body" has been read, or 0 if the input
ends first. */
int
findbody(FILE *in, FILE *out)
{
	/* Must be an int: with a char, EOF is either never matched
	(unsigned char) or confused with byte 0xFF (signed char). */
	int ch;

	while ((ch = fgetc(in)) != EOF) {
		fputc(ch, out);
		if (ch == '<'
		&& cmptag("body", in, out) == 0)
			return 1;
	}

	return 0;
}

/* Hyphenate a word, by means of hyphen library.
This is done so as to leverage sufficient hyphenation
patterns, with the ones used here having been taken
from those developed for TeX.

`word' is the NUL-terminated text to emit and `len' its length. Words
shorter than MINWLEN are written to `out' unchanged. Otherwise the word
is passed to hnj_hyphen_hyphenate2() and the result is written with each
'=' break mark replaced by `&shy;', except for a mark within the first
two characters of the result, which is dropped. If memory cannot be
allocated or the library reports an error, the word is written
unchanged. `in' is not used. */
void
hypword(char *word, int len, FILE *in, FILE *out, HyphenDict *dict)
{
	(void)in;

	if (len < MINWLEN) {
		fputs(word, out);
		return;
	}

	char *hyphens = calloc((size_t)len + 6, sizeof *hyphens);
	char *hyphword = calloc((size_t)len * 2, sizeof *hyphword);
	char **rep = NULL;
	int *pos = NULL, *cut = NULL;
	/* The library call is only reached when both buffers exist. */
	int ok = hyphens != NULL && hyphword != NULL
		&& hnj_hyphen_hyphenate2(dict, word, len, hyphens,
			hyphword, &rep, &pos, &cut) == 0;

	if (!ok) {
		/* Never lose text: fall back to the plain word. */
		fputs(word, out);
	}
	else {
		/* Process the given hyphenation. */
		size_t i;
		size_t n = strlen(hyphword);
		/* The two previously seen characters. Their initial
		values suppress a break mark at index 0 or 1. */
		char oldch = ' ', oldoldch = '\0';

		for (i = 0; i < n; ++i) {
			if (hyphword[i] == '=' && oldch != ' '
				&& oldoldch != ' ')
				fputs("&shy;", out);
			else if (hyphword[i] != '=')
				fputc(hyphword[i], out);

			oldoldch = oldch;
			oldch = hyphword[i];
		}
	}

	/* rep, pos and cut are allocated by the library when it applies
	non-standard hyphenation; they belong to the caller afterwards. */
	if (rep != NULL) {
		int k;
		for (k = 0; k < len; ++k)
			free(rep[k]);
		free(rep);
	}
	free(pos);
	free(cut);
	free(hyphens);
	free(hyphword);
}

/* Hyphenate the words within a tag.

Reads `in' from just after the opening tag named `tag' until the
matching closing tag has been read or the input ends. Characters are
collected into a word buffer; a blank, a punctuation character, a '<' or
a full buffer hands the collected word to hypword(), which writes it to
`out'. A character found in skip (checked against its first `skiplen'
entries) makes the collected word and the next few characters go to
`out' untouched. Any word still pending when the input ends is written
as well. */
void
hyptag(FILE *in, FILE *out, int skiplen, char *tag, HyphenDict *dict)
{
	/* fgetc() result; kept as an int so EOF stays distinguishable
	from every byte value. */
	int got;
	char ch, word[MAXWLEN], term[MAXWLEN];
	/* Fill level of word. While negative, it counts the characters
	that are still to be passed through verbatim. */
	int i = 0;

	/* Name of the closing tag: "/" followed by the tag name.
	Bounded, so an over-long name cannot overrun term. */
	snprintf(term, sizeof term, "/%s", tag);

	while ((got = fgetc(in)) != EOF) {
		ch = (char)got;
		if (i < 0) {
			fputc(ch, out);
			++i;
			if (checkskip(ch, skiplen)) i -= 3;
			continue;
		}
		/* A blank ends the current word; the blank itself falls
		through to the final else and starts the next word. */
		if (checkblank(ch)) {
			word[i] = '\0';
			hypword(word, i, in, out, dict);
			i = 0;
		}
		if (checkpunct(ch)) {
			word[i] = '\0';
			hypword(word, i, in, out, dict);
			fputc(ch, out);
			i = 0;
		}
		else if (checkskip(ch, skiplen)) {
			word[i] = '\0';
			fputs(word, out);
			fputc(ch, out);
			/* A simple way of working around
			HTML character codes. Each is 5 ( epsiv )
			 or 6 ( hellip ) characters long, plus '&' and ';'. */
			i = -3;
		}
		/* Check for closing tag. */
		else if (ch == '<') {
			word[i] = ch;
			++i;
			word[i] = '\0';
			hypword(word, i, in, out, dict);
			i = 0;
			/* The word buffer is reused for the tag name. */
			if (readtag(word, in, out) < 0) break;
			if (strcmp(word, term) == 0) break;
		}
		else {
			word[i] = ch;
			++i;
		}
		/* Flush before the buffer runs out of room for the
		terminator. */
		if (i == MAXWLEN - 1) {
			word[i] = '\0';
			hypword(word, i, in, out, dict);
			i = 0;
		}
	}

	/* The input ended inside the tag: emit what was collected
	instead of dropping it. Both break paths leave i at 0. */
	if (i > 0) {
		word[i] = '\0';
		hypword(word, i, in, out, dict);
	}
}

/* Hyphenate HTML input via `&shy;'.
hyp [in] [out]

Input is read from the file named by the first argument, or from stdin
when it is absent; output goes to the file named by the second argument,
or to stdout. Everything up to and including the opening body tag is
copied unchanged. After that the input is copied through, and the
content of every tag whose name appears in taglist is run through
hyptag().

Diagnostics are written to stderr so that they cannot end up inside the
generated document. Exit status: 0 on success, 1 if the input cannot be
opened, 2 if the output cannot be opened, 3 if no body tag is found,
4 if the hyphenation dictionary cannot be loaded. */
int
main(int argc, char **argv)
{
	FILE *in = stdin;
	if (argc >= 2) {
		in = fopen(argv[1], "r");
		if (in == NULL) {
			fprintf(stderr, "%s %s\n", argv[1], "inaccessible.");
			return 1;
		}
	}

	FILE *out = stdout;
	if (argc >= 3) {
		out = fopen(argv[2], "w");
		if (out == NULL) {
			fprintf(stderr, "%s %s\n", argv[2], "inaccessible.");
			return 2;
		}
	}

	if (findbody(in, out) == 0) {
		fputs("There is no body.\n", stderr);
		return 3;
	}

	HyphenDict *dict = hnj_hyphen_load(dictfile);
	if (dict == NULL) {
		fputs("Dict not readable.\n", stderr);
		return 4;
	}
	dict->utf8 = 1;

	/* taglist is terminated by an empty string. */
	int tagamt = 0;
	while (taglist[tagamt][0] != '\0')
		++tagamt;
	int skiplen = strlen(skip);

	/* fgetc() returns an int; a char cannot represent EOF reliably. */
	int ch;
	char tag[MAXWLEN];
	while ((ch = fgetc(in)) != EOF) {
		fputc(ch, out);
		/* Tags with an empty name, or cut off by the end of the
		input, are not looked up. */
		if (ch == '<' && readtag(tag, in, out) > 0
			&& checktag(tag, tagamt, in, out) != -1)
			hyptag(in, out, skiplen, tag, dict);
	}

	hnj_hyphen_free(dict);
	if (in != stdin)
		fclose(in);
	if (out != stdout)
		fclose(out);
	return 0;
}