#include #include #include #include #include "config.h" extern char *dictfile; /* Read a tag into a character array and return its length. */ int readtag(char *tag, FILE *in, FILE *out) { char ch; int i = 0; /* Data after a space in a tag is irrelevant. */ while ((ch = fgetc(in)) != '>' && ch != ' ') { fputc(ch, out); tag[i] = ch; ++i; } if (ch == ' ') { /* Seek to the end of the tag. */ while (ch != '>' && ch != EOF) { fputc(ch, out); ch = fgetc(in); } if (ch == EOF) { return -1; } fputc(ch, out); } else fputc(ch, out); tag[i] = '\0'; return i; } int cmptag(char *tag2, FILE *in, FILE *out) { char tag1[MAXWLEN]; readtag(tag1, in, out); return strcmp(tag1, tag2); } int checktag(char *tag, int tagamt, FILE *in, FILE *out) { int i; for (i = 0; i < tagamt; ++i) if (strcmp(tag, taglist[i]) == 0) return i; return -1; } /* Check if a character should be skipped. */ int checkskip(char ch, int skiplen) { int i; for (i = 0; i < skiplen; ++i) if (ch == skip[i]) return 1; return 0; } const char *punct = "';.,\"!?:"; /* Check if a character is punctuation. */ int checkpunct(char ch) { int i; for (i = 0; punct[i] != '\0'; ++i) if (ch == punct[i]) return 1; return 0; } const char *blank = " \n\r\t"; /* Check if a character is a blank. */ int checkblank(char ch) { int i; for (i = 0; blank[i] != '\0'; ++i) if (ch == blank[i]) return 1; return 0; } /* Loop until the body is found. */ int findbody(FILE *in, FILE *out) { char ch; while ((ch = fgetc(in)) != EOF) { fputc(ch, out); if (ch == '<' && cmptag("body", in, out) == 0) return 1; } return 0; } /* Hyphenate a word, by means of hyphen library. This is done so as to leverage sufficient hyphenation patterns, with the ones used here having been taken from those developed for TeX. 
*/
/* word is a NUL-terminated string of len bytes; the result is written to
   out. Words shorter than MINWLEN are written unchanged. Otherwise every
   '=' the library places in the hyphenated copy becomes "&shy;", except
   when one of the two preceding characters is a space; all other
   characters are copied as they are. in is not used. */
void hypword(char *word, int len, FILE *in, FILE *out, HyphenDict *dict)
{
	(void)in;

	if (len < MINWLEN) {
		fputs(word, out);
		return;
	}

	char *hyphens = calloc((size_t)len + 6, sizeof *hyphens);
	char *hyphword = calloc((size_t)len * 2, sizeof *hyphword);
	char **rep = NULL;
	int *pos = NULL, *cut = NULL;

	if (hyphens == NULL || hyphword == NULL) {
		/* Out of memory: emit the word as it is rather than lose it. */
		fputs(word, out);
		free(hyphens);
		free(hyphword);
		return;
	}

	hnj_hyphen_hyphenate2(dict, word, len, hyphens, hyphword,
	                      &rep, &pos, &cut);

	/* Process the given hyphenation. The entity is written instead of a
	   raw U+00AD so the output does not depend on the document encoding
	   and the character stays visible in this source. */
	size_t n = strlen(hyphword);
	char oldch = ' ', oldoldch = '\0';
	for (size_t k = 0; k < n; ++k) {
		char c = hyphword[k];

		if (c != '=')
			fputc(c, out);
		else if (oldch != ' ' && oldoldch != ' ')
			fputs("&shy;", out);
		oldoldch = oldch;
		oldch = c;
	}

	/* The library allocates rep, pos and cut on demand when handed NULL
	   pointers; releasing them is the caller's job. */
	if (rep != NULL) {
		for (int k = 0; k < len; ++k)
			free(rep[k]);
		free(rep);
	}
	free(pos);
	free(cut);
	free(hyphens);
	free(hyphword);
}

/* Hyphenate the words within a tag.

   Reads from in until the closing tag for `tag' has been read or the
   input ends, writing the processed text to out. Text is gathered into
   words, which are flushed through hypword() at blanks, punctuation, '<',
   and when the buffer fills. A blank is kept as the first character of
   the word that follows it. Every nested tag is copied through by
   readtag().

   skiplen is the number of characters of skip to honour (see checkskip). */
void hyptag(FILE *in, FILE *out, int skiplen, char *tag, HyphenDict *dict)
{
	char word[MAXWLEN], term[MAXWLEN];
	/* int, not char: fgetc() returns EOF as an out-of-band int value. */
	int ch;
	/* Fill level of word; while negative, -i characters are still to be
	   copied through untouched. */
	int i = 0;

	/* Name of the closing tag as readtag() reports it. snprintf cannot
	   overrun term, unlike strcat with a maximum-length tag. */
	snprintf(term, sizeof term, "/%s", tag);

	while ((ch = fgetc(in)) != EOF) {
		if (i < 0) {
			fputc(ch, out);
			++i;
			if (checkskip((char)ch, skiplen))
				i -= 3;
			continue;
		}

		if (checkblank((char)ch)) {
			word[i] = '\0';
			hypword(word, i, in, out, dict);
			i = 0;
		}

		if (checkpunct((char)ch)) {
			word[i] = '\0';
			hypword(word, i, in, out, dict);
			fputc(ch, out);
			i = 0;
		} else if (checkskip((char)ch, skiplen)) {
			word[i] = '\0';
			fputs(word, out);
			fputc(ch, out);
			/* A simple way of working around HTML character codes.
			   Each is 5 (&epsiv;) or 6 (&hellip;) characters long,
			   plus '&' and ';'. The next three characters are
			   copied through without being hyphenated. */
			i = -3;
		} else if (ch == '<') {
			/* Check for closing tag. The '<' itself is written
			   as the last character of the pending word. */
			word[i++] = (char)ch;
			word[i] = '\0';
			hypword(word, i, in, out, dict);
			i = 0;
			/* On failure the input is exhausted and word may not
			   be a valid string, so do not compare it. */
			if (readtag(word, in, out) < 0)
				break;
			if (strcmp(word, term) == 0)
				return;
		} else {
			word[i++] = (char)ch;
		}

		/* Flush before the buffer overflows. */
		if (i == MAXWLEN - 1) {
			word[i] = '\0';
			hypword(word, i, in, out, dict);
			i = 0;
		}
	}

	/* Input ended without the closing tag: do not drop the last word. */
	if (i > 0) {
		word[i] = '\0';
		hypword(word, i, in, out, dict);
	}
}

/* Hyphenate HTML input via `&shy;'.
hyp [in] [out] */
/* Without arguments the program filters stdin to stdout; the first
   argument names the input file and the second the output file.

   Everything up to and including the body tag is copied unchanged. After
   that, the contents of every tag listed in taglist are hyphenated and
   the rest is copied through.

   Diagnostics go to stderr so they cannot end up inside the HTML output.

   Exit status: 0 on success, 1 input inaccessible, 2 output inaccessible,
   3 no body found, 4 dictionary not readable, 5 writing the output
   failed. */
int main(int argc, char **argv)
{
	FILE *in = stdin;
	FILE *out = stdout;
	HyphenDict *dict = NULL;
	char tag[MAXWLEN];
	int tagamt = 0;
	int skiplen;
	/* int, not char: fgetc() returns EOF as an out-of-band int value. */
	int ch;
	int status = 0;

	if (argc >= 2) {
		in = fopen(argv[1], "r");
		if (in == NULL) {
			fprintf(stderr, "%s %s\n", argv[1], "inaccessible.");
			return 1;
		}
	}

	if (argc >= 3) {
		out = fopen(argv[2], "w");
		if (out == NULL) {
			fprintf(stderr, "%s %s\n", argv[2], "inaccessible.");
			status = 2;
			goto done;
		}
	}

	if (findbody(in, out) == 0) {
		fputs("There is no body.\n", stderr);
		status = 3;
		goto done;
	}

	dict = hnj_hyphen_load(dictfile);
	if (dict == NULL) {
		fputs("Dict not readable.\n", stderr);
		status = 4;
		goto done;
	}
	dict->utf8 = 1;

	/* taglist ends with an empty string. */
	while (taglist[tagamt][0] != '\0')
		++tagamt;
	skiplen = (int)strlen(skip);

	while ((ch = fgetc(in)) != EOF) {
		fputc(ch, out);
		if (ch == '<' && readtag(tag, in, out) > 0
		    && checktag(tag, tagamt, in, out) != -1)
			hyptag(in, out, skiplen, tag, dict);
	}

done:
	if (dict != NULL)
		hnj_hyphen_free(dict);
	if (in != stdin)
		fclose(in);
	if (out != NULL) {
		/* fclose() flushes; a failure here means output was lost. */
		int failed = ferror(out);

		if (fclose(out) != 0)
			failed = 1;
		if (failed && status == 0) {
			fputs("Write failed.\n", stderr);
			status = 5;
		}
	}
	return status;
}