#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <hyphen.h>
#include "config.h"
/* Read a tag name into a character array and return its length.
 *
 * Characters are consumed from `in` up to and including the closing '>'
 * and every one of them is echoed to `out`.  Only the part before the
 * first space is stored in `tag`; whatever follows the space is echoed
 * but not kept.
 *
 * tag: destination buffer, always NUL-terminated on return.  No bound is
 *      checked here, so the caller must supply a buffer large enough for
 *      the tag name.
 *
 * Returns the length of the stored name, or -1 if the input ends before
 * a '>' is seen. */
int
readtag(char *tag, FILE *in, FILE *out)
{
    /* fgetc() returns an int so that EOF stays distinct from every
       valid byte value; a char cannot hold that distinction. */
    int ch;
    int i = 0;

    /* Data after a space in a tag is irrelevant. */
    while ((ch = fgetc(in)) != EOF && ch != '>' && ch != ' ') {
        fputc(ch, out);
        tag[i] = (char)ch;
        ++i;
    }
    /* Terminate before any return so the caller never sees a
       half-built string. */
    tag[i] = '\0';

    /* Seek to the end of the tag, echoing what is skipped. */
    while (ch != EOF && ch != '>') {
        fputc(ch, out);
        ch = fgetc(in);
    }
    if (ch == EOF) {
        return -1;
    }
    fputc(ch, out);
    return i;
}
/* Read the tag at the current input position (echoing it to `out`
 * through readtag) and compare its name with `tag2`.
 *
 * Returns 0 when the names are equal and non-zero otherwise, including
 * when readtag reports that the input ended inside the tag. */
int
cmptag(char *tag2, FILE *in, FILE *out)
{
    char tag1[MAXWLEN];

    /* A failed read means no complete tag was seen; report a mismatch
       instead of comparing a partial buffer. */
    if (readtag(tag1, in, out) < 0) {
        return -1;
    }
    return strcmp(tag1, tag2);
}
/* Look `tag` up in taglist.
 *
 * Only the first `tagamt` entries are examined; `in` and `out` are
 * accepted but not used.
 *
 * Returns the index of the first entry equal to `tag`, or -1 if none
 * of them matches. */
int
checktag(char *tag, int tagamt, FILE *in, FILE *out)
{
    int idx = 0;

    while (idx < tagamt) {
        if (strcmp(tag, taglist[idx]) == 0) {
            return idx;
        }
        ++idx;
    }
    return -1;
}
/* Check if a character should be skipped. */
int
checkskip(char ch, int skiplen)
{
int i;
for (i = 0; i < skiplen; ++i)
if (ch == skip[i])
return 1;
return 0;
}
const char *punct = "';.,\"!?:";
/* Report whether `ch` is one of the characters listed in `punct`.
 * Returns 1 if it is, 0 otherwise. */
int
checkpunct(char ch)
{
    /* strchr() also matches the terminating NUL, which is not a member
       of the list, so that case is ruled out first. */
    if (ch == '\0') {
        return 0;
    }
    return strchr(punct, ch) != NULL;
}
const char *blank = " \n\r\t";
/* Report whether `ch` is one of the characters listed in `blank`
 * (space, newline, carriage return, tab).
 * Returns 1 if it is, 0 otherwise. */
int
checkblank(char ch)
{
    const char *p;

    for (p = blank; *p != '\0'; ++p) {
        if (*p == ch) {
            return 1;
        }
    }
    return 0;
}
/* Copy the input to the output until the opening body tag has been
 * passed.
 *
 * Every byte read is echoed to `out`.  After each '<' the tag that
 * follows is read (and echoed) by cmptag and compared with "body".
 *
 * Returns 1 once a body tag has been consumed, or 0 if the input ends
 * first. */
int
findbody(FILE *in, FILE *out)
{
    /* Kept as int: fgetc() reports end of input with EOF, which does
       not fit in a char without colliding with a real byte value. */
    int ch;

    while ((ch = fgetc(in)) != EOF) {
        fputc(ch, out);
        if (ch == '<' && cmptag("body", in, out) == 0) {
            return 1;
        }
    }
    return 0;
}
/* Hyphenate a word by means of the hyphen library and write the result
 * to `out`.
 *
 * The library is used so as to leverage sufficient hyphenation
 * patterns, the ones used here having been taken from those developed
 * for TeX.
 *
 * word: NUL-terminated text to hyphenate.
 * len:  length of `word` in bytes.
 * in:   not used.
 * out:  stream that receives the (possibly hyphenated) word.
 * dict: loaded hyphenation dictionary.
 *
 * Words shorter than MINWLEN are written unchanged.  The word is also
 * written unchanged when the work buffers cannot be allocated or the
 * library reports an error, so no input text is ever dropped. */
void
hypword(char *word, int len, FILE *in, FILE *out, HyphenDict *dict)
{
    char *hyphens = NULL;
    char *hyphword = NULL;
    char **rep = NULL;
    int *pos = NULL, *cut = NULL;
    char oldch = ' ', oldoldch = '\0';
    size_t hlen, i;
    int k;

    if (len < MINWLEN) {
        fputs(word, out);
        return;
    }

    /* Buffer sizes follow the library's notes: a few bytes more than
       the word for the hyphenation vector, about twice the word for
       the hyphenated copy. */
    hyphens = calloc((size_t)len + 6, sizeof *hyphens);
    hyphword = calloc((size_t)len * 2 + 1, sizeof *hyphword);
    if (hyphens == NULL || hyphword == NULL
        || hnj_hyphen_hyphenate2(dict, word, len, hyphens,
                                 hyphword, &rep, &pos, &cut) != 0) {
        fputs(word, out);
        goto done;
    }

    /* Process the given hyphenation: the library marks each break with
       '='.  A break is only emitted when neither of the two preceding
       characters is a space, which also suppresses breaks at the very
       start of the buffer.  The literal written at a break is kept
       exactly as it was (presumably a soft hyphen; it may not be
       visible in every editor). */
    hlen = strlen(hyphword);
    for (i = 0; i < hlen; ++i) {
        char c = hyphword[i];

        if (c != '=')
            fputc(c, out);
        else if (oldch != ' ' && oldoldch != ' ')
            fputs("­", out);
        oldoldch = oldch;
        oldch = c;
    }

done:
    /* rep/pos/cut are allocated by the library on demand (one slot per
       input byte) and belong to the caller afterwards. */
    if (rep != NULL) {
        for (k = 0; k < len; ++k)
            free(rep[k]);
    }
    free(rep);
    free(pos);
    free(cut);
    free(hyphens);
    free(hyphword);
}
/* Hyphenate the words within a tag.
 *
 * Reads from `in` until the closing tag matching `tag` has been
 * consumed or the input ends, writing everything to `out`.  Text is
 * collected into words which are passed to hypword; nested tags are
 * echoed by readtag.
 *
 * in, out: input and output streams.
 * skiplen: number of characters of `skip` to consult in checkskip.
 * tag:     name of the element being processed (without '<' or '/').
 * dict:    hyphenation dictionary handed on to hypword.
 *
 * The index `i` doubles as a state variable: while it is negative, that
 * many upcoming characters are copied through untouched. */
void
hyptag(FILE *in, FILE *out, int skiplen, char *tag, HyphenDict *dict)
{
    char word[MAXWLEN], term[MAXWLEN];
    int got;
    int i = 0;

    /* Name of the closing tag; snprintf cannot write past `term`. */
    snprintf(term, sizeof term, "/%s", tag);

    /* The raw fgetc() result is kept in an int so that EOF cannot be
       confused with a data byte. */
    while ((got = fgetc(in)) != EOF) {
        char ch = (char)got;

        if (i < 0) {
            /* Pass-through mode: echo the character; another skip
               character pushes the counter back by three. */
            fputc(ch, out);
            ++i;
            if (checkskip(ch, skiplen)) i -= 3;
            continue;
        }
        if (checkblank(ch)) {
            /* A blank ends the current word.  The blank itself is not
               punctuation, so it falls through to the final else below
               and becomes the first character of the next word. */
            word[i] = '\0';
            hypword(word, i, in, out, dict);
            i = 0;
        }
        if (checkpunct(ch)) {
            word[i] = '\0';
            hypword(word, i, in, out, dict);
            fputc(ch, out);
            i = 0;
        }
        else if (checkskip(ch, skiplen)) {
            word[i] = '\0';
            fputs(word, out);
            fputc(ch, out);
            /* A simple way of working around
               HTML character codes. Each is 5 ( epsiv )
               or 6 ( hellip ) characters long, plus '&' and ';'. */
            i = -3;
        }
        /* Check for closing tag. */
        else if (ch == '<') {
            word[i] = ch;
            ++i;
            word[i] = '\0';
            hypword(word, i, in, out, dict);
            i = 0;
            /* Input ended inside the tag: nothing more to process. */
            if (readtag(word, in, out) < 0) break;
            if (strcmp(word, term) == 0) break;
        }
        else {
            word[i] = ch;
            ++i;
        }
        /* Flush before the buffer fills up, keeping room for the
           terminator and for the '<' appended above. */
        if (i == MAXWLEN - 1) {
            word[i] = '\0';
            hypword(word, i, in, out, dict);
            i = 0;
        }
    }
    /* Input ended while a word was still being collected: write it out
       instead of losing it. */
    if (i > 0) {
        word[i] = '\0';
        hypword(word, i, in, out, dict);
    }
}
/* Hyphenate the text of selected HTML elements.
 *
 * Usage: hyp [in] [out]
 *
 * Input is read from the file named by the first argument (stdin when
 * absent) and written to the file named by the second (stdout when
 * absent).  Everything up to and including the body tag is copied
 * unchanged; after that, the contents of every element whose name
 * appears in taglist are passed through hyptag.
 *
 * Exit status: 0 on success, 1 if the input cannot be opened, 2 if the
 * output cannot be opened, 3 if no body tag is found, 4 if the
 * dictionary cannot be loaded.  Diagnostics go to stderr so that they
 * never end up inside the generated document. */
int
main(int argc, char **argv)
{
    FILE *in = stdin;
    FILE *out = stdout;
    HyphenDict *dict = NULL;
    char tag[MAXWLEN];
    int status = 0;
    int tagamt = 0;
    int skiplen;
    int ch;    /* int, not char, so EOF is distinguishable from data */

    if (argc >= 2) {
        in = fopen(argv[1], "r");
        if (in == NULL) {
            fprintf(stderr, "%s %s\n", argv[1], "inaccessible.");
            return 1;
        }
    }
    if (argc >= 3) {
        out = fopen(argv[2], "w");
        if (out == NULL) {
            fprintf(stderr, "%s %s\n", argv[2], "inaccessible.");
            status = 2;
            goto done;
        }
    }
    if (findbody(in, out) == 0) {
        fputs("There is no body.\n", stderr);
        status = 3;
        goto done;
    }
    dict = hnj_hyphen_load(dictfile);
    if (dict == NULL) {
        fputs("Dict not readable.\n", stderr);
        status = 4;
        goto done;
    }
    /* NOTE(review): presumably tells the library to treat the word
       data as UTF-8 regardless of the dictionary header -- confirm. */
    dict->utf8 = 1;

    /* taglist is terminated by an empty string. */
    while (taglist[tagamt][0] != '\0')
        ++tagamt;
    skiplen = (int)strlen(skip);

    while ((ch = fgetc(in)) != EOF) {
        fputc(ch, out);
        if (ch == '<' && readtag(tag, in, out) > 0
            && checktag(tag, tagamt, in, out) != -1)
            hyptag(in, out, skiplen, tag, dict);
    }

done:
    if (dict != NULL)
        hnj_hyphen_free(dict);
    if (out != NULL && out != stdout)
        fclose(out);
    if (in != stdin)
        fclose(in);
    return status;
}