/*
* $LynxId: LYCharUtils.c,v 1.116 2011/12/01 09:41:59 tom Exp $
*
* Functions associated with LYCharSets.c and the Lynx version of HTML.c - FM
* ==========================================================================
*/
#include <HTUtils.h>
#include <SGML.h>
#define Lynx_HTML_Handler
#include <HTChunk.h>
#include <HText.h>
#include <HTStyle.h>
#include <HTMIME.h>
#include <HTML.h>
#include <HTCJK.h>
#include <HTAtom.h>
#include <HTMLGen.h>
#include <HTParse.h>
#include <UCMap.h>
#include <UCDefs.h>
#include <UCAux.h>
#include <LYGlobalDefs.h>
#include <LYCharUtils.h>
#include <LYCharSets.h>
#include <HTAlert.h>
#include <HTForms.h>
#include <HTNestedList.h>
#include <GridText.h>
#include <LYStrings.h>
#include <LYUtils.h>
#include <LYMap.h>
#include <LYBookmark.h>
#include <LYCurses.h>
#include <LYCookie.h>
#include <LYexit.h>
#include <LYLeaks.h>
/*
 * Sentinel values used for nested ordered lists. - FM
 *
 * LYZero_OL_Counter() in this file stores OL_VOID in every OL_Counter[]
 * slot.  NOTE(review): OL_CONTINUE is not referenced in the visible part
 * of this file; presumably it marks a list that continues the previous
 * numbering -- confirm against the OL handling elsewhere.
 */
int OL_CONTINUE = -29999; /* flag for whether CONTINUE is set */
int OL_VOID = -29998; /* flag for whether a count is set */
/*
 * This function converts any ampersands in allocated strings to "&amp;".
 * If isTITLE is non-zero, it also converts any angle-brackets to "&lt;"
 * or "&gt;". - FM
 *
 * str      address of an allocated string.  When something has to be
 *          converted, the old string is released with FREE() and *str is
 *          set to a newly allocated replacement.  A NULL or empty string,
 *          or one with nothing to convert, is left untouched.
 * isTITLE  non-zero to convert '<' and '>' as well as '&'.
 *
 * The same truth test on isTITLE is used both for sizing the new buffer and
 * for doing the conversion, so the buffer is always large enough for every
 * replacement that is actually written.
 *
 * With CJK_EX defined and IS_CJK_TTY true, bytes that belong to an ESC '$'
 * or ESC '(' sequence (and the non-ASCII text it introduces), or to a
 * two-byte EUC/SJIS/Big5 character, are copied through unchanged.
 */
void LYEntify(char **str,
              int isTITLE)
{
    char *p = *str;
    char *q = NULL, *cp = NULL;
    size_t amps = 0, lts = 0, gts = 0;

#ifdef CJK_EX
    enum _state {
        S_text,
        S_esc,
        S_dollar,
        S_paren,
        S_nonascii_text,
        S_dollar_paren
    } state = S_text;
    int in_sjis = 0;
#endif

    if (isEmpty(p))
        return;

    /*
     * Count everything that may need expanding in a single pass.  This is
     * an upper bound: characters later copied verbatim by the CJK code are
     * counted too, which only makes the buffer slightly larger.
     */
    for (; *p != '\0'; p++) {
        if (*p == '&') {
            amps++;
        } else if (isTITLE) {
            if (*p == '<')
                lts++;
            else if (*p == '>')
                gts++;
        }
    }

    /*
     * Check whether we need to convert anything. - FM
     */
    if (amps == 0 && lts == 0 && gts == 0)
        return;

    /*
     * Allocate space and convert. - FM
     * p now points at the terminator, so (p - *str) is the original length.
     * "&amp;" adds 4 bytes per '&'; "&lt;" and "&gt;" add 3 bytes each.
     */
    q = typecallocn(char,
                    ((size_t) (p - *str)
                     + (4 * amps)
                     + (3 * lts)
                     + (3 * gts) + 1));

    if ((cp = q) == NULL)
        outofmem(__FILE__, "LYEntify");

    assert(cp != NULL);
    assert(q != NULL);

    for (p = *str; *p; p++) {
#ifdef CJK_EX
        if (IS_CJK_TTY) {
            switch (state) {
            case S_text:
                if (*p == '\033') {
                    state = S_esc;
                    *q++ = *p;
                    continue;
                }
                break;

            case S_esc:
                if (*p == '$') {
                    state = S_dollar;
                    *q++ = *p;
                    continue;
                } else if (*p == '(') {
                    state = S_paren;
                    *q++ = *p;
                    continue;
                } else {
                    state = S_text;
                    *q++ = *p;
                    continue;
                }

            case S_dollar:
                if (*p == '@' || *p == 'B' || *p == 'A') {
                    state = S_nonascii_text;
                    *q++ = *p;
                    continue;
                } else if (*p == '(') {
                    state = S_dollar_paren;
                    *q++ = *p;
                    continue;
                } else {
                    state = S_text;
                    *q++ = *p;
                    continue;
                }

            case S_dollar_paren:
                if (*p == 'C') {
                    state = S_nonascii_text;
                    *q++ = *p;
                    continue;
                } else {
                    state = S_text;
                    *q++ = *p;
                    continue;
                }

            case S_paren:
                if (*p == 'B' || *p == 'J' || *p == 'T') {
                    state = S_text;
                    *q++ = *p;
                    continue;
                } else if (*p == 'I') {
                    state = S_nonascii_text;
                    *q++ = *p;
                    continue;
                }
                /* FALLTHRU */
            case S_nonascii_text:
                if (*p == '\033')
                    state = S_esc;
                *q++ = *p;
                continue;

            default:
                break;
            }
            /* Copy both bytes of a two-byte character without inspection. */
            if (*(p + 1) != '\0' &&
                (IS_EUC(UCH(*p), UCH(*(p + 1))) ||
                 IS_SJIS(UCH(*p), UCH(*(p + 1)), in_sjis) ||
                 IS_BIG5(UCH(*p), UCH(*(p + 1))))) {
                *q++ = *p++;
                *q++ = *p;
                continue;
            }
        }
#endif
        if (*p == '&') {
            *q++ = '&';
            *q++ = 'a';
            *q++ = 'm';
            *q++ = 'p';
            *q++ = ';';
        } else if (isTITLE && *p == '<') {
            *q++ = '&';
            *q++ = 'l';
            *q++ = 't';
            *q++ = ';';
        } else if (isTITLE && *p == '>') {
            *q++ = '&';
            *q++ = 'g';
            *q++ = 't';
            *q++ = ';';
        } else {
            *q++ = *p;
        }
    }
    *q = '\0';

    /* Hand the converted copy back to the caller in place of the original. */
    FREE(*str);
    *str = cp;
}
/*
 * Decide whether LYEntifyTitle()/LYEntifyValue() have to make an entified
 * copy of 'source'.  Most strings contain nothing to convert, so answering
 * FALSE here lets those wrappers skip the allocation entirely.
 *
 * Returns TRUE if 'source' contains any of '<', '&' or '>'.  With CJK_EX
 * defined, it also returns TRUE when IS_CJK_TTY is set and the string
 * contains an ESC character.
 */
static BOOL MustEntify(const char *source)
{
#ifdef CJK_EX
    if (IS_CJK_TTY && strchr(source, '\033') != 0)
        return TRUE;
#endif
    /*
     * strcspn() gives the index of the first '<', '&' or '>', or of the
     * terminating NUL if there is none.
     */
    return (BOOL) (source[strcspn(source, "<&>")] != '\0');
}
/*
 * Wrappers for LYEntify() which do not assume that the source was allocated,
 * e.g., output from gettext().
 *
 * LYEntifyTitle() converts '&', '<' and '>'.  If 'source' needs no
 * conversion it is returned as-is and *target is not touched; otherwise
 * 'source' is copied into *target, entified there, and *target is returned.
 */
const char *LYEntifyTitle(char **target, const char *source)
{
    if (!MustEntify(source))
        return source;

    StrAllocCopy(*target, source);
    LYEntify(target, TRUE);
    return *target;
}
/*
 * Like LYEntifyTitle(), but calls LYEntify() with isTITLE FALSE, so only
 * ampersands are converted.  If MustEntify() reports nothing to do,
 * 'source' is returned and *target is not touched.
 */
const char *LYEntifyValue(char **target, const char *source)
{
    if (!MustEntify(source))
        return source;

    StrAllocCopy(*target, source);
    LYEntify(target, FALSE);
    return *target;
}
/*
 * Remove leading characters for which WHITE() is true -- but not ESC --
 * from a string, in place, by sliding the remainder (terminator included)
 * down to the start of the buffer. - FM
 *
 * A NULL or empty string is left alone.
 */
void LYTrimHead(char *str)
{
    const char *src;
    char *dst;

    if (isEmpty(str))
        return;

    /* Find the first character that must be kept. */
    for (src = str; *src != '\0'; src++) {
        if (!WHITE(*src) || UCH(*src) == UCH(CH_ESC))       /* S/390 -- gil -- 1669 */
            break;
    }

    if (src == str)
        return;                 /* nothing to trim */

    dst = str;
    while ((*dst++ = *src++) != '\0') {
        ;
    }
}
/*
 * Remove trailing characters for which WHITE() is true (this includes ESC)
 * from a string, in place, by overwriting each of them with NUL. - FM
 *
 * A NULL or empty string is left alone.
 */
void LYTrimTail(char *str)
{
    int last;

    if (isEmpty(str))
        return;

    for (last = (int) strlen(str) - 1; last >= 0 && WHITE(str[last]); last--)
        str[last] = '\0';
}
/*
 * This function should receive a pointer to the start of a comment
 * ("<!--").  It returns a pointer to the end ('>') character of the
 * comment, or its best guess if the comment is invalid. - FM
 *
 * Return value:
 *   NULL               if str is NULL;
 *   str                if str does not begin with "<!--", or contains no
 *                      '>' after it;
 *   the '>' at str+4   for "<!-->";
 *   the first '>'      for a run of "decorative" dashes ("<!---...") and
 *                      for any comment that turns out to be malformed;
 *   the closing '>'    for a well-formed comment.
 */
char *LYFindEndOfComment(char *str)
{
    enum {
        saw_reopen_dash,        /* one '-' seen after a "--" close; need another */
        in_body,                /* inside comment text, looking for a '-' */
        saw_close_dash,         /* one '-' seen in the body; need another */
        after_close             /* "--" seen; expect '>', blanks, or a new "--" */
    } phase;
    char *scan;
    char *first_gt;

    if (str == NULL)
        return NULL;

    /* Not the start of a comment at all. */
    if (StrNCmp(str, "<!--", 4) != 0)
        return str;

    scan = str + 4;

    /* "<!-->" is invalid; its '>' is the best end we have. */
    if (*scan == '>')
        return scan;

    /* Without any '>' there is no end to report. */
    first_gt = strchr(scan, '>');
    if (first_gt == NULL)
        return str;

    /* A "decorative" series of dashes: settle for the next '>'. */
    if (*scan == '-')
        return first_gt;

    /*
     * OK, we're ready to start parsing. - FM
     */
    for (phase = in_body; *scan != '\0'; scan++) {
        const char c = *scan;

        if (phase == in_body) {
            if (c == '-')
                phase = saw_close_dash;
        } else if (phase == after_close) {
            if (c == '>')
                return scan;    /* valid comment */
            if (c == '-') {
                phase = saw_reopen_dash;
            } else if (!WHITE(c) || UCH(c) == UCH(CH_ESC)) {    /* S/390 -- gil -- 1686 */
                return first_gt;        /* junk between "--" and '>' */
            }
        } else {
            /* Either half-dash state must be completed by a second '-'. */
            if (c != '-')
                return first_gt;
            phase = (phase == saw_reopen_dash) ? in_body : after_close;
        }
    }

    /*
     * Ran off the end: invalid comment, so return the first '>' found. - FM
     */
    return first_gt;
}
/*
 * If an HREF, itself or if resolved against a base,
 * represents a file URL, and the host is defaulted,
 * force in "//localhost". We need this until
 * all the other Lynx code which performs security
 * checks based on the "localhost" string is changed
 * to assume "//localhost" when a host field is not
 * present in file URLs - FM
 *
 * href  address of an allocated string; it may be replaced or extended in
 *       place.  Nothing is done if *href is NULL or empty.
 * base  URL the HREF would be resolved against; only consulted when *href
 *       is "//" or starts with "///".  May be NULL.
 *
 * The steps below run in sequence and each later test looks at the value
 * of *href produced by the earlier ones.
 */
void LYFillLocalFileURL(char **href,
const char *base)
{
/* scratch string, released once at the end of the function */
char *temp = NULL;
if (isEmpty(*href))
return;
/*
 * A scheme-less "//" or "///..." HREF takes the file scheme from a file
 * base: STR_FILE_URL is put in front of it.
 */
if (!strcmp(*href, "//") || !StrNCmp(*href, "///", 3)) {
if (base != NULL && isFILE_URL(base)) {
StrAllocCopy(temp, STR_FILE_URL);
StrAllocCat(temp, *href);
StrAllocCopy(*href, temp);
}
}
/*
 * Fill in the defaulted host of a file URL.  The offsets 5, 6, 7 and 8
 * match the lengths of "file:", "file:/", "file://" and "file:///".
 */
if (isFILE_URL(*href)) {
if (*(*href + 5) == '\0') {
/* nothing after the first five characters */
StrAllocCat(*href, "//localhost");
} else if (!strcmp(*href, "file://")) {
StrAllocCat(*href, "localhost");
} else if (!StrNCmp(*href, "file:///", 8)) {
/* empty host: hand everything from the third slash on to LYLocalFileToURL() */
StrAllocCopy(temp, (*href + 7));
LYLocalFileToURL(href, temp);
} else if (!StrNCmp(*href, "file:/", 6) && !LYIsHtmlSep(*(*href + 6))) {
/* single slash after the scheme: hand over everything from that slash on */
StrAllocCopy(temp, (*href + 5));
LYLocalFileToURL(href, temp);
}
}
#if defined(USE_DOS_DRIVES)
if (LYIsDosDrive(*href)) {
/*
 * If it's a local DOS path beginning with drive letter,
 * add file://localhost/ prefix and go ahead.
 */
StrAllocCopy(temp, *href);
LYLocalFileToURL(href, temp);
}
/* use below: strlen("file://localhost/") = 17 */
if (!StrNCmp(*href, "file://localhost/", 17)
&& (strlen(*href) == 19)
&& LYIsDosDrive(*href + 17)) {
/*
 * Terminate DOS drive letter with a slash to surf root successfully.
 * Here seems a proper place to do so.
 */
LYAddPathSep(href);
}
#endif /* USE_DOS_DRIVES */
/*
 * No path in a file://localhost URL means a
 * directory listing for the current default. - FM
 */
if (!strcmp(*href, "file://localhost")) {
const char *temp2;
#ifdef VMS
temp2 = HTVMS_wwwName(LYGetEnv("PATH"));
#else
char curdir[LY_MAXPATH];
temp2 = wwwName(Current_Dir(curdir));
#endif /* VMS */
/* make sure host and path end up separated by exactly one separator */
if (!LYIsHtmlSep(*temp2))
LYAddHtmlSep(href);
/*
 * Check for pathological cases - current dir has chars which MUST BE
 * URL-escaped - kw
 */
if (strchr(temp2, '%') != NULL || strchr(temp2, '#') != NULL) {
/* drop any earlier scratch value before taking the HTEscape() result */
FREE(temp);
temp = HTEscape(temp2, URL_PATH);
StrAllocCat(*href, temp);
} else {
StrAllocCat(*href, temp2);
}
}
#ifdef VMS
/*
 * On VMS, a file://localhost/ URL means
 * a listing for the login directory. - FM
 */
if (!strcmp(*href, "file://localhost/"))
StrAllocCat(*href, (HTVMS_wwwName(Home_Dir()) + 1));
#endif /* VMS */
FREE(temp);
return;
}
/*
 * Write a <META http-equiv="content-type"> line naming the MIME charset of
 * character set 'disp_chndl' to 'target' through its put_string method.
 *
 * A disp_chndl of -1 means "use current_char_set".  Nothing is written if
 * target is null or the resulting handle is negative.
 */
void LYAddMETAcharsetToStream(HTStream *target, int disp_chndl)
{
    char *line = 0;
    int cs = (disp_chndl == -1) ? current_char_set : disp_chndl;

    if (target == 0 || cs < 0)
        return;

    HTSprintf0(&line, "<META %s content=\"text/html;charset=%s\">\n",
               "http-equiv=\"content-type\"",
               LYCharSet_UC[cs].MIMEname);
    (*target->isa->put_string) (target, line);
    FREE(line);
}
/*
 * This function writes a line with a META tag to an open file,
 * which will specify a charset parameter to use when the file is
 * read back in.  It is meant for temporary HTML files used by the
 * various special pages which may show titles of documents.  When those
 * files are created, the title strings normally have been translated and
 * expanded to the display character set, so we have to make sure they
 * don't get translated again.
 * If the user has changed the display character set during the lifetime
 * of the Lynx session (or, more exactly, during the time the title
 * strings to be written were generated), they may now have different
 * character encodings and there is currently no way to get it all right.
 * To change this, we would have to add a variable for each string which
 * keeps track of its character encoding.
 * But at least we can try to ensure that reading the file after future
 * display character set changes will give reasonable output.
 *
 * The META tag is not written if the display character set (passed as
 * disp_chndl) already corresponds to the charset assumption that
 * would be made when the file is read. - KW
 *
 * Currently this function is used for temporary files like "Lynx Info Page"
 * and for one permanent - bookmarks (so it may be a problem if you change
 * the display charset later: new bookmark entries may be mistranslated).
 * - LP
 *
 * fd          open output file; nothing is written if it is NULL.
 * disp_chndl  character set handle, or -1 to use current_char_set; nothing
 *             is written if the resulting handle is negative.
 */
void LYAddMETAcharsetToFD(FILE *fd, int disp_chndl)
{
    const int cs = (disp_chndl == -1) ? current_char_set : disp_chndl;

    /* Should not happen. */
    if (fd == NULL || cs < 0)
        return;

    /*
     * Skip the tag when it is not needed: either the handle already equals
     * UCLYhndl_HTFile_for_unspec, or the charset is 7-bit, in which case
     * there shouldn't be any 8-bit characters to misread.
     */
    if (UCLYhndl_HTFile_for_unspec == cs ||
        LYCharSet_UC[cs].enc == UCT_ENC_7BIT)
        return;

    /*
     * In other cases we don't know because UCLYhndl_for_unspec may change
     * during the lifetime of the file (by toggling raw mode or changing the
     * display character set), so proceed.
     */
    fprintf(fd, "<META %s content=\"text/html;charset=%s\">\n",
            "http-equiv=\"content-type\"",
            LYCharSet_UC[cs].MIMEname);
}
/*
 * This function returns OL TYPE="A" strings in
 * the range of " A." (1) to "ZZZ." (18278). - FM
 *
 * Values of 1 or less give " A."; values above 18278 give "ZZZ.".
 * One-letter labels carry a leading blank, longer ones do not.
 * The result is held in a static buffer that the next call overwrites.
 */
char *LYUppercaseA_OL_String(int seqnum)
{
    static char OLstring[8];
    int digit[3];               /* letter ordinals 1..26, least significant first */
    int count = 0;
    int rest;

    if (seqnum <= 1) {
        strcpy(OLstring, " A.");
        return OLstring;
    }
    if (seqnum >= 18279) {
        strcpy(OLstring, "ZZZ.");
        return OLstring;
    }

    /*
     * The labels are a base-26 numbering without a zero digit (A=1 .. Z=26),
     * so peel off ((rest - 1) % 26) + 1 at each step.  For 2..18278 this
     * yields at most three digits.
     */
    for (rest = seqnum; rest > 0; rest = (rest - 1) / 26)
        digit[count++] = (rest - 1) % 26 + 1;

    switch (count) {
    case 1:
        sprintf(OLstring, " %c.", (digit[0] + 64));
        break;
    case 2:
        sprintf(OLstring, "%c%c.", (digit[1] + 64), (digit[0] + 64));
        break;
    default:
        sprintf(OLstring, "%c%c%c.", (digit[2] + 64), (digit[1] + 64),
                (digit[0] + 64));
        break;
    }
    return OLstring;
}
/*
 * This function returns OL TYPE="a" strings in
 * the range of " a." (1) to "zzz." (18278). - FM
 *
 * Values of 1 or less give " a."; values above 18278 give "zzz.".
 * One-letter labels carry a leading blank, longer ones do not.
 * The result is held in a static buffer that the next call overwrites.
 */
char *LYLowercaseA_OL_String(int seqnum)
{
    static char OLstring[8];
    int digit[3];               /* letter ordinals 1..26, least significant first */
    int count = 0;
    int rest;

    if (seqnum <= 1) {
        strcpy(OLstring, " a.");
        return OLstring;
    }
    if (seqnum >= 18279) {
        strcpy(OLstring, "zzz.");
        return OLstring;
    }

    /*
     * The labels are a base-26 numbering without a zero digit (a=1 .. z=26),
     * so peel off ((rest - 1) % 26) + 1 at each step.  For 2..18278 this
     * yields at most three digits.
     */
    for (rest = seqnum; rest > 0; rest = (rest - 1) / 26)
        digit[count++] = (rest - 1) % 26 + 1;

    switch (count) {
    case 1:
        sprintf(OLstring, " %c.", (digit[0] + 96));
        break;
    case 2:
        sprintf(OLstring, "%c%c.", (digit[1] + 96), (digit[0] + 96));
        break;
    default:
        sprintf(OLstring, "%c%c%c.", (digit[2] + 96), (digit[1] + 96),
                (digit[0] + 96));
        break;
    }
    return OLstring;
}
/*
 * This function returns OL TYPE="I" strings in the
 * range of " I." (1) to "MMM." (3000).- FM
 * Maximum length: 16 -TD
 *
 * seqnum  the list item number.
 *
 * Returns a pointer to a static buffer (overwritten by the next call)
 * holding the Roman numeral followed by '.'.  Numerals that consist of a
 * single symbol (1, 5, 10, 50, 100, 500, 1000) get a leading blank.
 * Values of 3000 or more all give "MMM."; zero and negative values have no
 * numeral and give just ".".
 *
 * The conversion walks one table of values in descending order, including
 * the subtractive pairs (CM, CD, XC, XL, IX, IV), instead of a hand-written
 * cascade of tests.
 */
char *LYUppercaseI_OL_String(int seqnum)
{
    static const struct {
        int value;
        const char *symbol;
    } steps[] = {
        { 1000, "M" },
        { 900, "CM" },
        { 500, "D" },
        { 400, "CD" },
        { 100, "C" },
        { 90, "XC" },
        { 50, "L" },
        { 40, "XL" },
        { 10, "X" },
        { 9, "IX" },
        { 5, "V" },
        { 4, "IV" },
        { 1, "I" }
    };
    static char OLstring[20];
    char numeral[20];           /* longest is MMDCCCLXXXVIII, 14 symbols */
    int Arabic = seqnum;
    size_t i;

    if (Arabic >= 3000) {
        strcpy(OLstring, "MMM.");
        return OLstring;
    }

    numeral[0] = '\0';
    for (i = 0; i < sizeof(steps) / sizeof(steps[0]); i++) {
        while (Arabic >= steps[i].value) {
            strcat(numeral, steps[i].symbol);
            Arabic -= steps[i].value;
        }
    }

    /* Single-symbol numerals are padded with one leading blank. */
    OLstring[0] = '\0';
    if (numeral[0] != '\0' && numeral[1] == '\0')
        strcat(OLstring, " ");
    strcat(OLstring, numeral);
    strcat(OLstring, ".");
    return OLstring;
}
/*
 * This function returns OL TYPE="i" strings in
 * range of " i." (1) to "mmm." (3000).- FM
 * Maximum length: 16 -TD
 *
 * seqnum  the list item number.
 *
 * Returns a pointer to a static buffer (overwritten by the next call)
 * holding the lower-case Roman numeral followed by '.'.  Numerals that
 * consist of a single symbol (1, 5, 10, 50, 100, 500, 1000) get a leading
 * blank.  Values of 3000 or more all give "mmm."; zero and negative values
 * have no numeral and give just ".".
 *
 * The conversion walks one table of values in descending order, including
 * the subtractive pairs (cm, cd, xc, xl, ix, iv), instead of a hand-written
 * cascade of tests.
 */
char *LYLowercaseI_OL_String(int seqnum)
{
    static const struct {
        int value;
        const char *symbol;
    } steps[] = {
        { 1000, "m" },
        { 900, "cm" },
        { 500, "d" },
        { 400, "cd" },
        { 100, "c" },
        { 90, "xc" },
        { 50, "l" },
        { 40, "xl" },
        { 10, "x" },
        { 9, "ix" },
        { 5, "v" },
        { 4, "iv" },
        { 1, "i" }
    };
    static char OLstring[20];
    char numeral[20];           /* longest is mmdccclxxxviii, 14 symbols */
    int Arabic = seqnum;
    size_t i;

    if (Arabic >= 3000) {
        strcpy(OLstring, "mmm.");
        return OLstring;
    }

    numeral[0] = '\0';
    for (i = 0; i < sizeof(steps) / sizeof(steps[0]); i++) {
        while (Arabic >= steps[i].value) {
            strcat(numeral, steps[i].symbol);
            Arabic -= steps[i].value;
        }
    }

    /* Single-symbol numerals are padded with one leading blank. */
    OLstring[0] = '\0';
    if (numeral[0] != '\0' && numeral[1] == '\0')
        strcat(OLstring, " ");
    strcat(OLstring, numeral);
    strcat(OLstring, ".");
    return OLstring;
}
/*
 * This function initializes the Ordered List counter. - FM
 *
 * Each of the 12 OL_Counter[] slots is set to OL_VOID and each OL_Type[]
 * slot to '1'; Last_OL_Count becomes 0 and Last_OL_Type '1'.
 * A NULL 'me' is ignored.
 */
void LYZero_OL_Counter(HTStructured * me)
{
    int level;

    if (me == NULL)
        return;

    me->Last_OL_Count = 0;
    me->Last_OL_Type = '1';

    for (level = 0; level != 12; ++level) {
        me->OL_Type[level] = '1';
        me->OL_Counter[level] = OL_VOID;
    }
}
/*
 * This function is used by the HTML Structured object. - KW
 *
 * It loads me->UCLYhndl and me->UCI from the UCT_STAGE_STRUCTURED
 * information of me->node_anchor.  If the anchor has no handle for that
 * stage yet (negative), one is set up first: the UCT_STAGE_HTEXT handle is
 * used, or, if that is negative too, current_char_set, which is then also
 * recorded for UCT_STAGE_HTEXT.
 */
void LYGetChartransInfo(HTStructured * me)
{
    me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
                                        UCT_STAGE_STRUCTURED);

    if (me->UCLYhndl < 0) {
        int fallback = HTAnchor_getUCLYhndl(me->node_anchor,
                                            UCT_STAGE_HTEXT);

        if (fallback < 0) {
            /* Nothing known for the text stage either: use the display set. */
            fallback = current_char_set;
            HTAnchor_setUCInfoStage(me->node_anchor, fallback,
                                    UCT_STAGE_HTEXT,
                                    UCT_SETBY_STRUCTURED);
        }

        HTAnchor_setUCInfoStage(me->node_anchor, fallback,
                                UCT_STAGE_STRUCTURED,
                                UCT_SETBY_STRUCTURED);

        /* Read the handle back now that the stage has been set. */
        me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
                                            UCT_STAGE_STRUCTURED);
    }

    me->UCI = HTAnchor_getUCInfoStage(me->node_anchor,
                                      UCT_STAGE_STRUCTURED);
}
/*
 * The sixteen upper-case hexadecimal digits in ascending order.
 * As in HTParse.c, saves some calls - kw
 * NOTE(review): no use of this table is visible in this part of the file;
 * presumably it is indexed by a 4-bit value when URL-escaping -- confirm.
 */
static const char *hex = "0123456789ABCDEF";
/*
* Any raw 8-bit or multibyte characters already have been
* handled in relation to the display character set
* in SGML_character(), including named and numeric entities.
*
* This function is used to translate HTML special fields inside tags
* (ALT=, VALUE=, etc.) from charset `cs_from' to charset `cs_to'.
* It also unescapes non-ASCII characters from URLs (#fragments!)
* if st_URL is active.
*
* If `do_ent' is YES, it converts named entities
* and numeric character references (NCRs) to their `cs_to' replacements.
*
* Named entities converted to unicodes. NCRs (unicodes) converted
* by UCdomap.c chartrans functions.
* ???NCRs with values in the ISO-8859-1 range 160-255 may be converted
* to their HTML entity names (via old-style entities) and then translated
* according to the LYCharSets.c array for `cs_out'???.
*
* Some characters (see descriptions in `put_special_unicodes' from SGML.c)
* translated in relation with the state of boolean variables
* `use_lynx_specials', `plain_space' and `hidden'. It is not clear yet:
*
* If plain_space is TRUE, nbsp (160) will be treated as an ASCII
* space (32). If hidden is TRUE, entities will be translated
* (if `do_ent' is YES) but escape sequences will be passed unaltered.
* If `hidden' is FALSE, some characters are converted to Lynx special
* codes (see `put_special_unicodes') or ASCII space if `plain_space'
* applies). @@ is `use_lynx_specials' needed, does it have any effect? @@
* If `use_lynx_specials' is YES, translate byte values 160 and 173
* meaning U+00A0 and U+00AD given as or converted from raw char input
* are converted to HT_NON_BREAK_SPACE and LY_SOFT_HYPHEN, respectively
* (unless input and output charset are both iso-8859-1, for compatibility
* with previous usage in HTML.c) even if `hidden' or `plain_space' is set.
*
* If `Back' is YES, the reverse is done instead i.e., Lynx special codes
* in the input are translated back to character values.
*
* If `Back' is YES, an attempt is made to use UCReverseTransChar() for
* back translation which may be more efficient. (?)
*
* If `stype' is st_URL, non-ASCII characters are URL-encoded instead.
* The sequence of bytes being URL-encoded is the raw input character if
* we couldn't translate it from `cs_in' (CJK etc.); otherwise it is the
* UTF-8 representation if either `cs_to' requires this or if the
* character's Unicode value is > 255, otherwise it should be the iso-8859-1
* representation.
* No general URL-encoding occurs for displayable ASCII characters and
* spaces and some C0 controls valid in HTML (LF, TAB), it is expected
* that other functions will take care of that as appropriate.
*
* Escape characters (0x1B, '\033') are
* - URL-encoded if `stype' is st_URL, otherwise
* - dropped if `stype' is st_other, otherwise (i.e., st_HTML)
* - passed if `hidden' is TRUE or HTCJK is set, otherwise
* - dropped.
*
* (If `stype' is st_URL or st_other most of the parameters really predefined:
* cs_from=cs_to, use_lynx_specials=plain_space=NO, and hidden=YES)
*
*
* Returns pointer to the char** passed in
* if string translated or translation unnecessary,
* NULL otherwise
* (in which case something probably went wrong.)
*
*
* In general, this somehow ugly function (KW)
* cover three functions from v.2.7.2 (FM):
* extern void LYExpandString (
* HTStructured * me,
* char ** str);
* extern void LYUnEscapeEntities (
* HTStructured * me,
* char ** str);
* extern void LYUnEscapeToLatinOne (
* HTStructured * me,
* char ** str,
* BOOLEAN isURL);
*/
char **LYUCFullyTranslateString(char **str,
int cs_from,
int cs_to,
int do_ent,
int use_lynx_specials,
int plain_space,
int hidden,
int Back,
CharUtil_st stype)
{
char *p;
char *q, *qs;
HTChunk *chunk = NULL;
char *cp = 0;
char cpe = 0;
char *esc = NULL;
char replace_buf[64];
int uck;
int lowest_8;
UCode_t code = 0;
BOOL output_utf8 = 0, repl_translated_C0 = 0;
size_t len;
const char *name = NULL;
BOOLEAN no_bytetrans;
UCTransParams T;
BOOL from_is_utf8 = FALSE;
char *puni;
enum _state {
S_text,
S_esc,
S_dollar,
S_paren,
S_nonascii_text,
S_dollar_paren,
S_trans_byte,
S_check_ent,
S_ncr,
S_check_uni,
S_named,
S_check_name,
S_recover,
S_got_oututf8,
S_got_outstring,
S_put_urlstring,
S_got_outchar,
S_put_urlchar,
S_next_char,
S_done
} state = S_text;
enum _parsing_what {
P_text,
P_utf8,
P_hex,
P_decimal,
P_named
} what = P_text;
#ifdef KANJI_CODE_OVERRIDE
static unsigned char sjis_1st = '\0';
unsigned char sjis_str[3];
#endif
/*
* Make sure we have a non-empty string. - FM
*/
if (isEmpty(*str))
return str;
/*
* FIXME: something's wrong with the limit checks here (clearing the
* buffer helps).
*/
memset(replace_buf, 0, sizeof(replace_buf));
/*
* Don't do byte translation if original AND target character sets are both
* iso-8859-1 (and we are not called to back-translate), or if we are in
* CJK mode.
*/
if (IS_CJK_TTY
#ifdef EXP_JAPANESEUTF8_SUPPORT
&& (strcmp(LYCharSet_UC[cs_from].MIMEname, "utf-8") != 0)
&& (strcmp(LYCharSet_UC[cs_to].MIMEname, "utf-8") != 0)
#endif
) {
no_bytetrans = TRUE;
} else if (cs_to <= 0 && cs_from == cs_to && (!Back || cs_to < 0)) {
no_bytetrans = TRUE;
} else {
/* No need to translate or examine the string any further */
no_bytetrans = (BOOL) (!use_lynx_specials && !Back &&
UCNeedNotTranslate(cs_from, cs_to));
}
/*
* Save malloc/calloc overhead in simple case - kw
*/
if (do_ent && hidden && (stype != st_URL) && (strchr(*str, '&') == NULL))
do_ent = FALSE;
/* Can't do, caller should figure out what to do... */
if (!UCCanTranslateFromTo(cs_from, cs_to)) {
if (cs_to < 0)
return NULL;
if (!do_ent && no_bytetrans)
return NULL;
no_bytetrans = TRUE;
} else if (cs_to < 0) {
do_ent = FALSE;
}
if (!do_ent && no_bytetrans)
return str;
p = *str;
if (!no_bytetrans) {
UCTransParams_clear(&T);
UCSetTransParams(&T, cs_from, &LYCharSet_UC[cs_from],
cs_to, &LYCharSet_UC[cs_to]);
from_is_utf8 = (BOOL) (LYCharSet_UC[cs_from].enc == UCT_ENC_UTF8);
output_utf8 = T.output_utf8;
repl_translated_C0 = T.repl_translated_C0;
puni = p;
} else if (do_ent) {
output_utf8 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8 ||
HText_hasUTF8OutputSet(HTMainText));
repl_translated_C0 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_8BIT_C0);
}
lowest_8 = LYlowest_eightbit[cs_to];
/*
* Create a buffer string seven times the length of the original, so we
* have plenty of room for expansions. - FM
*/
len = strlen(p) + 16;
q = p;
qs = q;
/* Create the HTChunk only if we need it */
#define CHUNK (chunk ? chunk : (chunk = HTChunkCreate2(128, len+1)))
#define REPLACE_STRING(s) \
if (q != qs) HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
HTChunkPuts(CHUNK, s); \
qs = q = *str
#define REPLACE_CHAR(c) if (q > p) { \
HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
qs = q = *str; \
*q++ = c; \
} else \
*q++ = c
/*
* Loop through string, making conversions as needed.
*
* The while() checks for a non-'\0' char only for the normal text states
* since other states may temporarily modify p or *p (which should be
* restored before S_done!) - kw
*/
while (*p || (state != S_text && state != S_nonascii_text)) {
switch (state) {
case S_text:
code = UCH(*p);
#ifdef KANJI_CODE_OVERRIDE
if (HTCJK == JAPANESE && last_kcode == SJIS) {
if (sjis_1st == '\0' && (IS_SJIS_HI1(code) || IS_SJIS_HI2(code))) {
sjis_1st = UCH(code);
} else if (sjis_1st && IS_SJIS_LO(code)) {
sjis_1st = '\0';
} else {
if (conv_jisx0201kana && 0xA1 <= code && code <= 0xDF) {
sjis_str[2] = '\0';
JISx0201TO0208_SJIS(UCH(code),
sjis_str, sjis_str + 1);
REPLACE_STRING(sjis_str);
p++;
continue;
}
}
}
#endif
if (*p == '\033') {
if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
state = S_esc;
if (stype == st_URL) {
REPLACE_STRING("%1B");
p++;
continue;
} else if (stype != st_HTML) {
p++;
continue;
} else {
*q++ = *p++;
continue;
}
} else if (!hidden) {
/*
* CJK handling not on, and not a hidden INPUT, so block
* escape. - FM
*/
state = S_next_char;
} else {
state = S_trans_byte;
}
} else {
state = (do_ent ? S_check_ent : S_trans_byte);
}
break;
case S_esc:
if (*p == '$') {
state = S_dollar;
*q++ = *p++;
continue;
} else if (*p == '(') {
state = S_paren;
*q++ = *p++;
continue;
} else {
state = S_text;
}
break;
case S_dollar:
if (*p == '@' || *p == 'B' || *p == 'A') {
state = S_nonascii_text;
*q++ = *p++;
continue;
} else if (*p == '(') {
state = S_dollar_paren;
*q++ = *p++;
continue;
} else {
state = S_text;
}
break;
case S_dollar_paren:
if (*p == 'C') {
state = S_nonascii_text;
*q++ = *p++;
continue;
} else {
state = S_text;
}
break;
case S_paren:
if (*p == 'B' || *p == 'J' || *p == 'T') {
state = S_text;
*q++ = *p++;
continue;
} else if (*p == 'I') {
state = S_nonascii_text;
*q++ = *p++;
continue;
} else {
state = S_text;
}
break;
case S_nonascii_text:
if (*p == '\033') {
if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
state = S_esc;
if (stype == st_URL) {
REPLACE_STRING("%1B");
p++;
continue;
} else if (stype != st_HTML) {
p++;
continue;
}
}
}
*q++ = *p++;
continue;
case S_trans_byte:
/* character translation goes here */
/*
* Don't do anything if we have no string, or if original AND
* target character sets are both iso-8859-1, or if we are in CJK
* mode.
*/
if (*p == '\0' || no_bytetrans) {
state = S_got_outchar;
break;
}
if (Back) {
int rev_c;
if ((*p) == HT_NON_BREAK_SPACE ||
(*p) == HT_EN_SPACE) {
if (plain_space) {
code = *p = ' ';
state = S_got_outchar;
break;
} else {
code = 160;
if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
(LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
state = S_got_outchar;
break;
} else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
|| (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
state = S_check_uni;
break;
} else {
*(unsigned char *) p = UCH(160);
}
}
} else if ((*p) == LY_SOFT_HYPHEN) {
code = 173;
if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
(LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
state = S_got_outchar;
break;
} else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
|| (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
state = S_check_uni;
break;
} else {
*(unsigned char *) p = UCH(173);
}
#ifdef EXP_JAPANESEUTF8_SUPPORT
} else if (output_utf8) {
if ((!strcmp(LYCharSet_UC[cs_from].MIMEname, "euc-jp") &&
(IS_EUC((unsigned char) (*p),
(unsigned char) (*(p + 1))))) ||
(!strcmp(LYCharSet_UC[cs_from].MIMEname, "shift_jis") &&
(IS_SJIS_2BYTE((unsigned char) (*p),
(unsigned char) (*(p + 1)))))) {
code = UCTransJPToUni(p, 2, cs_from);
p++;
state = S_check_uni;
break;
}
#endif
} else if (code < 127 || T.transp) {
state = S_got_outchar;
break;
}
rev_c = UCReverseTransChar(*p, cs_to, cs_from);
if (rev_c > 127) {
*p = (char) rev_c;
code = rev_c;
state = S_got_outchar;
break;
}
} else if (code < 127) {
state = S_got_outchar;
break;
}
if (from_is_utf8) {
if (((*p) & 0xc0) == 0xc0) {
puni = p;
code = UCGetUniFromUtf8String(&puni);
if (code <= 0) {
code = UCH(*p);
} else {
what = P_utf8;
}
}
} else if (use_lynx_specials && !Back &&
(code == 160 || code == 173) &&
(LYCharSet_UC[cs_from].enc == UCT_ENC_8859 ||
(LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
if (code == 160)
code = *p = HT_NON_BREAK_SPACE;
else if (code == 173)
code = *p = LY_SOFT_HYPHEN;
state = S_got_outchar;
break;
} else if (T.trans_to_uni) {
code = UCTransToUni(*p, cs_from);
if (code <= 0) {
/* What else can we do? */
code = UCH(*p);
}
} else if (!T.trans_from_uni) {
state = S_got_outchar;
break;
}
/*
* Substitute Lynx special character for 160 (nbsp) if
* use_lynx_specials is set.
*/
if (use_lynx_specials && !Back &&
(code == 160 || code == 173)) {
code = ((code == 160 ? HT_NON_BREAK_SPACE : LY_SOFT_HYPHEN));
state = S_got_outchar;
break;
}
state = S_check_uni;
break;
case S_check_ent:
if (*p == '&') {
char *pp = p + 1;
len = strlen(pp);
/*
* Check for a numeric entity. - FM
*/
if (*pp == '#' && len > 2 &&
(*(pp + 1) == 'x' || *(pp + 1) == 'X') &&
UCH(*(pp + 2)) < 127 &&
isxdigit(UCH(*(pp + 2)))) {
what = P_hex;
state = S_ncr;
} else if (*pp == '#' && len > 2 &&
UCH(*(pp + 1)) < 127 &&
isdigit(UCH(*(pp + 1)))) {
what = P_decimal;
state = S_ncr;
} else if (UCH(*pp) < 127 &&
isalpha(UCH(*pp))) {
what = P_named;
state = S_named;
} else {
state = S_trans_byte;
}
} else {
state = S_trans_byte;
}
break;
case S_ncr:
if (what == P_hex) {
p += 3;
} else { /* P_decimal */
p += 2;
}
cp = p;
while (*p && UCH(*p) < 127 &&
(what == P_hex ? isxdigit(UCH(*p)) :
isdigit(UCH(*p)))) {
p++;
}
/*
* Save the terminator and isolate the digit(s). - FM
*/
cpe = *p;
if (*p)
*p++ = '\0';
/*
* Show the numeric entity if the value:
* (1) Is greater than 255 and unhandled Unicode.
* (2) Is less than 32, and not valid and we don't have HTCJK set.
* (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
* (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
*/
if (UCScanCode(&code, cp, (BOOL) (what == P_hex))) {
code = LYcp1252ToUnicode(code);
state = S_check_uni;
} else {
state = S_recover;
break;
}
break;
case S_check_uni:
/*
* Show the numeric entity if the value:
* (2) Is less than 32, and not valid and we don't have HTCJK set.
* (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
* (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
*/
if ((code < 32 &&
code != 9 && code != 10 && code != 13 &&
!IS_CJK_TTY) ||
(code == 127 &&
!(HTPassHighCtrlRaw || IS_CJK_TTY)) ||
(code > 127 && code < 160 &&
!HTPassHighCtrlNum)) {
state = S_recover;
break;
}
/*
* Convert the value as an unsigned char, hex escaped if isURL is
* set and it's 8-bit, and then recycle the terminator if it is not
* a semicolon. - FM
*/
if (code > 159 && stype == st_URL) {
state = S_got_oututf8;
break;
}
/*
* For 160 (nbsp), use that value if it's a hidden INPUT, otherwise
* use an ASCII space (32) if plain_space is TRUE, otherwise use
* the Lynx special character. - FM
*/
if (code == 160) {
if (plain_space) {
code = ' ';
state = S_got_outchar;
break;
} else if (use_lynx_specials) {
code = HT_NON_BREAK_SPACE;
state = S_got_outchar;
break;
} else if ((hidden && !Back)
|| (LYCharSet_UC[cs_to].codepoints & UCT_CP_SUPERSETOF_LAT1)
|| LYCharSet_UC[cs_to].enc == UCT_ENC_8859
|| (LYCharSet_UC[cs_to].like8859 &
UCT_R_8859SPECL)) {
state = S_got_outchar;
break;
} else if (
(LYCharSet_UC[cs_to].repertoire & UCT_REP_SUPERSETOF_LAT1)) {
; /* nothing, may be translated later */
} else {
code = ' ';
state = S_got_outchar;
break;
}
}
/*
* For 173 (shy), use that value if it's a hidden INPUT, otherwise
* ignore it if plain_space is TRUE, otherwise use the Lynx special
* character. - FM
*/
if (code == 173) {
if (plain_space) {
replace_buf[0] = '\0';
state = S_got_outstring;
break;
} else if (Back &&
!(LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
(LYCharSet_UC[cs_to].like8859 &
UCT_R_8859SPECL))) {
; /* nothing, may be translated later */
} else if (hidden || Back) {
state = S_got_outchar;
break;
} else if (use_lynx_specials) {
code = LY_SOFT_HYPHEN;
state = S_got_outchar;
break;
}
}
/*
* Seek a translation from the chartrans tables.
*/
if ((uck = UCTransUniChar(code,
cs_to)) >= 32 &&
uck < 256 &&
(uck < 127 || uck >= lowest_8)) {
code = uck;
state = S_got_outchar;
break;
} else if ((uck == -4 ||
(repl_translated_C0 &&
uck > 0 && uck < 32)) &&
/*
* Not found; look for replacement string.
*/
UCTransUniCharStr(replace_buf,
60, code,
cs_to,
0) >= 0) {
state = S_got_outstring;
break;
}
if (output_utf8 &&
code > 127 && code < 0x7fffffffL) {
state = S_got_oututf8;
break;
}
/*
* For 8194 (ensp), 8195 (emsp), or 8201 (thinsp), use the
* character reference if it's a hidden INPUT, otherwise use an
* ASCII space (32) if plain_space is TRUE, otherwise use the Lynx
* special character. - FM
*/
if (code == 8194 || code == 8195 || code == 8201) {
if (hidden) {
state = S_recover;
} else if (plain_space) {
code = ' ';
state = S_got_outchar;
} else {
code = HT_EN_SPACE;
state = S_got_outchar;
}
break;
/*
* Ignore 8204 (zwnj), 8205 (zwj) 8206 (lrm), and 8207 (rlm),
* for now, if we got this far without finding a representation
* for them.
*/
} else if (code == 8204 || code == 8205 ||
code == 8206 || code == 8207) {
CTRACE((tfp, "LYUCFullyTranslateString: Ignoring '%"
PRI_UCode_t "'.\n", code));
replace_buf[0] = '\0';
state = S_got_outstring;
break;
/*
* Show the numeric entity if the value: (1) Is greater than
* 255 and unhandled Unicode.
*/
} else if (code > 255) {
/*
* Illegal or not yet handled value. Return "&#" verbatim and
* continue from there. - FM
*/
state = S_recover;
break;
/*
* If it's ASCII, or is 8-bit but HTPassEightBitNum is set or
* the character set is "ISO Latin 1", use its value. - FM
*/
} else if (code < 161 ||
(code < 256 &&
(HTPassEightBitNum || cs_to == LATIN1))) {
/*
* No conversion needed.
*/
state = S_got_outchar;
break;
/* The following disabled section doesn't make sense any more.
* It used to make sense in the past, when S_check_named would
* look in "old style" tables in addition to what it does now.
* Disabling of going to S_check_name here prevents endless
* looping between S_check_uni and S_check_names states, which
* could occur here for Latin 1 codes for some cs_to if they
* had no translation in that cs_to. Normally all cs_to
* *should* now have valid translations via UCTransUniChar or
* UCTransUniCharStr for all Latin 1 codes, so that we would
* not get here anyway, and no loop could occur. Still, if we
* *do* get here, FALL THROUGH to case S_recover now. - kw
*/
#if 0
/*
* If we get to here, convert and handle the character as a
* named entity. - FM
*/
} else {
name = HTMLGetEntityName(code - 160);
state = S_check_name;
break;
#endif
}
case S_recover:
if (what == P_decimal || what == P_hex) {
/*
* Illegal or not yet handled value. Return "&#" verbatim and
* continue from there. - FM
*/
*q++ = '&';
*q++ = '#';
if (what == P_hex)
*q++ = 'x';
if (cpe != '\0')
*(p - 1) = cpe;
p = cp;
state = S_done;
} else if (what == P_named) {
*cp = cpe;
*q++ = '&';
state = S_done;
} else if (!T.output_utf8 && stype == st_HTML && !hidden &&
!(HTPassEightBitRaw &&
UCH(*p) >= lowest_8)) {
sprintf(replace_buf, "U%.2" PRI_UCode_t "", code);
state = S_got_outstring;
} else {
puni = p;
code = UCH(*p);
state = S_got_outchar;
}
break;
case S_named:
cp = ++p;
while (*cp && UCH(*cp) < 127 &&
isalnum(UCH(*cp)))
cp++;
cpe = *cp;
*cp = '\0';
name = p;
state = S_check_name;
break;
case S_check_name:
/*
* Seek the Unicode value for the named entity.
*
* !!!! We manually recover the case of '=' terminator which is
* commonly found on query to CGI-scripts enclosed as href= URLs
* like "somepath/?x=1&yz=2" Without this dirty fix, submission of
* such URLs was broken if &yz string happened to be a recognized
* entity name. - LP
*/
if (((code = HTMLGetEntityUCValue(name)) > 0) &&
!((cpe == '=') && (stype == st_URL))) {
state = S_check_uni;
break;
}
/*
* Didn't find the entity. Return verbatim.
*/
state = S_recover;
break;
/* * * O U T P U T S T A T E S * * */
case S_got_oututf8:
if (code > 255 ||
(code >= 128 && LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8)) {
UCConvertUniToUtf8(code, replace_buf);
state = S_got_outstring;
} else {
state = S_got_outchar;
}
break;
case S_got_outstring:
if (what == P_decimal || what == P_hex) {
if (cpe != ';' && cpe != '\0')
*(--p) = cpe;
p--;
} else if (what == P_named) {
*cp = cpe;
p = (*cp != ';') ? (cp - 1) : cp;
} else if (what == P_utf8) {
p = puni;
}
if (replace_buf[0] == '\0') {
state = S_next_char;
break;
}
if (stype == st_URL) {
code = replace_buf[0]; /* assume string OK if first char is */
if (code >= 127 ||
(code < 32 && (code != 9 && code != 10 && code != 0))) {
state = S_put_urlstring;
break;
}
}
REPLACE_STRING(replace_buf);
state = S_next_char;
break;
case S_put_urlstring:
esc = HTEscape(replace_buf, URL_XALPHAS);
REPLACE_STRING(esc);
FREE(esc);
state = S_next_char;
break;
case S_got_outchar:
if (what == P_decimal || what == P_hex) {
if (cpe != ';' && cpe != '\0')
*(--p) = cpe;
p--;
} else if (what == P_named) {
*cp = cpe;
p = (*cp != ';') ? (cp - 1) : cp;
} else if (what == P_utf8) {
p = puni;
}
if (stype == st_URL &&
/* Not a full HTEscape, only for 8bit and ctrl chars */
(TOASCII(code) >= 127 || /* S/390 -- gil -- 1925 */
(code < ' ' && (code != '\t' && code != '\n')))) {
state = S_put_urlchar;
break;
} else if (!hidden && code == 10 && *p == 10
&& q != qs && *(q - 1) == 13) {
/*
* If this is not a hidden string, and the current char is the
* LF ('\n') of a CRLF pair, drop the CR ('\r'). - KW
*/
*(q - 1) = *p++;
state = S_done;
break;
}
*q++ = (char) code;
state = S_next_char;
break;
case S_put_urlchar:
*q++ = '%';
REPLACE_CHAR(hex[(TOASCII(code) >> 4) & 15]); /* S/390 -- gil -- 1944 */
REPLACE_CHAR(hex[(TOASCII(code) & 15)]);
/* fall through */
case S_next_char:
p++; /* fall through */
case S_done:
state = S_text;
what = P_text;
/* for next round */
}
}
*q = '\0';
if (chunk) {
HTChunkPutb(CHUNK, qs, (int) (q - qs + 1)); /* also terminates */
if (stype == st_URL || stype == st_other) {
LYTrimHead(chunk->data);
LYTrimTail(chunk->data);
}
StrAllocCopy(*str, chunk->data);
HTChunkFree(chunk);
} else {
if (stype == st_URL || stype == st_other) {
LYTrimHead(qs);
LYTrimTail(qs);
}
}
return str;
}
#undef REPLACE_CHAR
#undef REPLACE_STRING
/*
 * Translate an HTML string from cs_from to cs_to, with entity handling
 * enabled and the "Back" flag off, by delegating to
 * LYUCFullyTranslateString().
 *
 * *str may be reallocated by the call, even if cs_to == 0.
 *
 * Returns YES if LYUCFullyTranslateString() returned a non-NULL pointer,
 * NO otherwise.
 */
BOOL LYUCTranslateHTMLString(char **str,
                             int cs_from,
                             int cs_to,
                             int use_lynx_specials,
                             int plain_space,
                             int hidden,
                             CharUtil_st stype)
{
    char **translated = LYUCFullyTranslateString(str, cs_from, cs_to,
                                                 TRUE,  /* do entities */
                                                 use_lynx_specials,
                                                 plain_space,
                                                 hidden,
                                                 NO,    /* not "Back" */
                                                 stype);

    return (BOOL) (translated != NULL);
}
BOOL LYUCTranslateBackFormData(char **str,
int cs_from,
int cs_to,
int plain_space)
{
char **ret;
/* May reallocate *str */
ret = (LYUCFullyTranslateString(str, cs_from, cs_to, FALSE,
NO, plain_space, YES,
YES, st_HTML));
return (BOOL) (ret != NULL);
}
/*
 * Extract the value of a ';'-introduced parameter (e.g., "URL") from the
 * CONTENT string of an HTML META tag.
 *
 * The parameter name is only looked for directly after a ';' (plus any
 * further ';' or white space), and is compared case-insensitively.  Any
 * white space and '=' following the name are skipped; the value then runs
 * up to the first character that is not printable or is white space.  A
 * value of more than two characters enclosed in single quotes has the
 * quotes removed.
 *
 * Returns a newly allocated string, or NULL if the parameter is not found.
 */
char *LYParseTagParam(char *from,
                      const char *name)
{
    const size_t name_len = strlen(name);
    char *scan = from;
    char *value = NULL;
    size_t value_len = 0;

    /*
     * Step from one ';' to the next until the text following it starts
     * with the wanted name.
     */
    for (;;) {
        scan = strchr(scan, ';');
        if (scan == NULL)
            return NULL;

        while (*scan != '\0' && (isspace(UCH(*scan)) || *scan == ';'))
            scan++;

        if (strlen(scan) < name_len)
            return NULL;        /* too little left to hold the name */

        if (strncasecomp(scan, name, (int) name_len) == 0)
            break;
    }

    /*
     * Move past the name and whatever separates it from its value.
     */
    scan += name_len;
    while (*scan != '\0' && (*scan == '=' || isspace(UCH(*scan))))
        scan++;

    /*
     * Copy the remainder, then cut the copy at the end of the value.
     */
    StrAllocCopy(value, scan);
    while (isprint(UCH(scan[value_len])) && !isspace(UCH(scan[value_len])))
        value_len++;
    value[value_len] = '\0';

    /*
     * Strip single quotes, just in case.  The closing quote is overwritten
     * by the terminator, and the body plus that terminator is shifted down
     * over the opening quote.
     */
    if (value_len > 2 &&
        value[0] == '\'' &&
        value[value_len - 1] == '\'') {
        value[value_len - 1] = '\0';
        memmove(value, value + 1, value_len - 1);
    }
    return value;
}
/*
 * Split the content string of a META Refresh into its delay and its URL.
 *
 * *p_seconds receives a newly allocated copy of the run of decimal digits
 * found after any leading blanks, or NULL if the content does not start
 * with a digit.  *p_address receives the result of looking up the "URL"
 * parameter with LYParseTagParam(), which may be NULL.  The rest of the
 * content is ignored.
 */
void LYParseRefreshURL(char *content,
                       char **p_seconds,
                       char **p_address)
{
    char *delay = NULL;
    char *first_digit = LYSkipBlanks(content);

    /*
     * Look for the Seconds field. - FM
     */
    if (isdigit(UCH(*first_digit))) {
        char *past_digits = first_digit + 1;

        while (isdigit(UCH(*past_digits)))
            past_digits++;
        StrnAllocCopy(delay, first_digit, (int) (past_digits - first_digit));
    }

    *p_seconds = delay;
    *p_address = LYParseTagParam(content, "URL");

    CTRACE((tfp,
            "LYParseRefreshURL\n\tcontent: %s\n\tseconds: %s\n\taddress: %s\n",
            content, NonNull(*p_seconds), NonNull(*p_address)));
}
/*
 * This function processes META tags in HTML streams. - FM
 *
 * Parameters:
 *   me       the HTML structured-stream object; nothing is done if NULL.
 *   present  per-attribute "was given" flags, indexed by HTML_META_*;
 *            nothing is done if NULL.
 *   value    per-attribute values, indexed like present.
 *   include  unused.
 *
 * Private copies of the HTTP-EQUIV, NAME, CONTENT and CHARSET attributes
 * are made and then acted upon:
 *   - a CHARSET attribute, or a "charset" inside the CONTENT of an
 *     HTTP-EQUIV="Content-Type", sets up character translation when the
 *     anchor has no charset yet;
 *   - Pragma / Cache-Control and Expires may set the no_cache flag;
 *   - Refresh emits a "REFRESH(n sec):" link into the text;
 *   - Content-Disposition may supply a suggested filename;
 *   - Set-Cookie is passed to LYSetCookie() for http/https documents.
 *
 * Every string allocated here, including a Refresh URL that ends up not
 * being used, is released at the common exit.
 */
void LYHandleMETA(HTStructured * me, const BOOL *present,
                  const char **value,
                  char **include GCC_UNUSED)
{
    char *http_equiv = NULL, *name = NULL, *content = NULL, *charset = NULL;
    char *href = NULL, *id_string = NULL, *temp = NULL;
    char *cp, *cp0, *cp1 = NULL;
    int url_type = 0;

    if (!me || !present)
        return;

    /*
     * Load the attributes for possible use by Lynx. - FM
     */
    if (present[HTML_META_HTTP_EQUIV] &&
        non_empty(value[HTML_META_HTTP_EQUIV])) {
        StrAllocCopy(http_equiv, value[HTML_META_HTTP_EQUIV]);
        convert_to_spaces(http_equiv, TRUE);
        LYUCTranslateHTMLString(&http_equiv, me->tag_charset, me->tag_charset,
                                NO, NO, YES, st_other);
        if (*http_equiv == '\0') {
            FREE(http_equiv);
        }
    }
    if (present[HTML_META_NAME] &&
        non_empty(value[HTML_META_NAME])) {
        StrAllocCopy(name, value[HTML_META_NAME]);
        convert_to_spaces(name, TRUE);
        LYUCTranslateHTMLString(&name, me->tag_charset, me->tag_charset,
                                NO, NO, YES, st_other);
        if (*name == '\0') {
            FREE(name);
        }
    }
    if (present[HTML_META_CONTENT] &&
        non_empty(value[HTML_META_CONTENT])) {
        /*
         * Technically, we should be creating a comma-separated list, but META
         * tags come one at a time, and we'll handle (or ignore) them as each
         * is received.  Also, at this point, we only trim leading and trailing
         * blanks from the CONTENT value, without translating any named
         * entities or numeric character references, because how we should do
         * that depends on what type of information it contains, and whether or
         * not any of it might be sent to the screen. - FM
         */
        StrAllocCopy(content, value[HTML_META_CONTENT]);
        convert_to_spaces(content, FALSE);
        LYTrimHead(content);
        LYTrimTail(content);
        if (*content == '\0') {
            FREE(content);
        }
    }
    if (present[HTML_META_CHARSET] &&
        non_empty(value[HTML_META_CHARSET])) {
        StrAllocCopy(charset, value[HTML_META_CHARSET]);
        convert_to_spaces(charset, TRUE);
        LYUCTranslateHTMLString(&charset, me->tag_charset, me->tag_charset,
                                NO, NO, YES, st_other);
        if (*charset == '\0') {
            FREE(charset);
        }
    }
    CTRACE((tfp,
            "LYHandleMETA: HTTP-EQUIV=\"%s\" NAME=\"%s\" CONTENT=\"%s\" CHARSET=\"%s\"\n",
            NONNULL(http_equiv),
            NONNULL(name),
            NONNULL(content),
            NONNULL(charset)));

    /*
     * Check for a text/html Content-Type with a charset directive, if we
     * didn't already set the charset via a server's header. - AAC & FM
     */
    if (isEmpty(me->node_anchor->charset) &&
        (charset ||
         (!strcasecomp(NonNull(http_equiv), "Content-Type") && content))) {
        LYUCcharset *p_in = NULL;
        LYUCcharset *p_out = NULL;

        if (charset) {
            LYLowerCase(charset);
        } else {
            LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
                                    NO, NO, YES, st_other);
            LYLowerCase(content);
        }

        if ((cp1 = charset) != NULL ||
            (cp1 = strstr(content, "charset")) != NULL) {
            BOOL chartrans_ok = NO;
            char *cp3 = NULL, *cp4;
            int chndl;

            /*
             * When the name came from CONTENT, step over the word "charset"
             * itself; then skip the separators in front of the name.
             */
            if (!charset)
                cp1 += 7;
            while (*cp1 == ' ' || *cp1 == '=' || *cp1 == '"')
                cp1++;

            StrAllocCopy(cp3, cp1);     /* copy to mutilate more */
            for (cp4 = cp3; (*cp4 != '\0' && *cp4 != '"' &&
                             *cp4 != ';' && *cp4 != ':' &&
                             !WHITE(*cp4)); cp4++) {
                ;               /* do nothing */
            }
            *cp4 = '\0';
            cp4 = cp3;
            chndl = UCGetLYhndl_byMIME(cp3);

#ifdef CAN_SWITCH_DISPLAY_CHARSET
            /* Allow a switch to a more suitable display charset */
            if (Switch_Display_Charset(chndl, SWITCH_DISPLAY_CHARSET_MAYBE)) {
                /* UCT_STAGE_STRUCTURED and UCT_STAGE_HTEXT
                   should have the same setting for UCInfoStage. */
                HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_STRUCTURED);

                me->outUCLYhndl = current_char_set;
                HTAnchor_setUCInfoStage(me->node_anchor,
                                        current_char_set,
                                        UCT_STAGE_HTEXT,
                                        UCT_SETBY_MIME);        /* highest priority! */
                HTAnchor_setUCInfoStage(me->node_anchor,
                                        current_char_set,
                                        UCT_STAGE_STRUCTURED,
                                        UCT_SETBY_MIME);        /* highest priority! */
                me->outUCI = HTAnchor_getUCInfoStage(me->node_anchor,
                                                     UCT_STAGE_HTEXT);
                /* The SGML stage will be reset in change_chartrans_handling */
            }
#endif
            if (UCCanTranslateFromTo(chndl, current_char_set)) {
                chartrans_ok = YES;
                StrAllocCopy(me->node_anchor->charset, cp4);
                HTAnchor_setUCInfoStage(me->node_anchor, chndl,
                                        UCT_STAGE_PARSER,
                                        UCT_SETBY_STRUCTURED);
            } else if (chndl < 0) {
                /*
                 * Got something but we don't recognize it.
                 */
                chndl = UCLYhndl_for_unrec;
                if (chndl < 0)  /* UCLYhndl_for_unrec not defined :-( */
                    chndl = UCLYhndl_for_unspec;        /* always >= 0 */
                if (UCCanTranslateFromTo(chndl, current_char_set)) {
                    chartrans_ok = YES;
                    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
                                            UCT_STAGE_PARSER,
                                            UCT_SETBY_STRUCTURED);
                }
            }
            if (chartrans_ok) {
                p_in = HTAnchor_getUCInfoStage(me->node_anchor,
                                               UCT_STAGE_PARSER);
                p_out = HTAnchor_setUCInfoStage(me->node_anchor,
                                                current_char_set,
                                                UCT_STAGE_HTEXT,
                                                UCT_SETBY_DEFAULT);
                if (!p_out) {
                    /*
                     * Try again.
                     */
                    p_out = HTAnchor_getUCInfoStage(me->node_anchor,
                                                    UCT_STAGE_HTEXT);
                }
                if (!strcmp(p_in->MIMEname, "x-transparent")) {
                    HTPassEightBitRaw = TRUE;
                    HTAnchor_setUCInfoStage(me->node_anchor,
                                            HTAnchor_getUCLYhndl(me->node_anchor,
                                                                 UCT_STAGE_HTEXT),
                                            UCT_STAGE_PARSER,
                                            UCT_SETBY_DEFAULT);
                }
                if (!strcmp(p_out->MIMEname, "x-transparent")) {
                    HTPassEightBitRaw = TRUE;
                    HTAnchor_setUCInfoStage(me->node_anchor,
                                            HTAnchor_getUCLYhndl(me->node_anchor,
                                                                 UCT_STAGE_PARSER),
                                            UCT_STAGE_HTEXT,
                                            UCT_SETBY_DEFAULT);
                }
                if ((p_in->enc != UCT_ENC_CJK)
#ifdef EXP_JAPANESEUTF8_SUPPORT
                    && (p_in->enc != UCT_ENC_UTF8)
#endif
                    ) {
                    HTCJK = NOCJK;
                    if (!(p_in->codepoints &
                          UCT_CP_SUBSETOF_LAT1) &&
                        chndl == current_char_set) {
                        HTPassEightBitRaw = TRUE;
                    }
                } else if (p_out->enc == UCT_ENC_CJK) {
                    Set_HTCJK(p_in->MIMEname, p_out->MIMEname);
                }
                LYGetChartransInfo(me);
                /*
                 * Update the chartrans info homologously to a Content-Type
                 * MIME header with a charset parameter. - FM
                 */
                if (me->UCLYhndl != chndl) {
                    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
                                            UCT_STAGE_MIME,
                                            UCT_SETBY_STRUCTURED);
                    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
                                            UCT_STAGE_PARSER,
                                            UCT_SETBY_STRUCTURED);
                    me->inUCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
                                                          UCT_STAGE_PARSER);
                    me->inUCI = HTAnchor_getUCInfoStage(me->node_anchor,
                                                        UCT_STAGE_PARSER);
                }
                UCSetTransParams(&me->T,
                                 me->inUCLYhndl, me->inUCI,
                                 me->outUCLYhndl, me->outUCI);
            } else {
                /*
                 * Cannot translate.  If according to some heuristic the given
                 * charset and the current display character both are likely to
                 * be like ISO-8859 in structure, pretend we have some kind of
                 * match.
                 */
                BOOL given_is_8859 = (BOOL) (!StrNCmp(cp4, "iso-8859-", 9) &&
                                             isdigit(UCH(cp4[9])));
                BOOL given_is_8859like = (BOOL) (given_is_8859
                                                 || !StrNCmp(cp4, "windows-", 8)
                                                 || !StrNCmp(cp4, "cp12", 4)
                                                 || !StrNCmp(cp4, "cp-12", 5));
                BOOL given_and_display_8859like = (BOOL) (given_is_8859like &&
                                                          (strstr(LYchar_set_names[current_char_set],
                                                                  "ISO-8859") ||
                                                           strstr(LYchar_set_names[current_char_set],
                                                                  "windows-")));

                if (given_is_8859) {
                    /*
                     * Cut the name after the digits of the part number.
                     */
                    cp1 = &cp4[10];
                    while (*cp1 &&
                           isdigit(UCH((*cp1))))
                        cp1++;
                    *cp1 = '\0';
                }
                if (given_and_display_8859like) {
                    StrAllocCopy(me->node_anchor->charset, cp4);
                    HTPassEightBitRaw = TRUE;
                }
                HTAlert(*cp4 ? cp4 : me->node_anchor->charset);
            }
            FREE(cp3);

            if (me->node_anchor->charset) {
                CTRACE((tfp,
                        "LYHandleMETA: New charset: %s\n",
                        me->node_anchor->charset));
            }
        }
        /*
         * Set the kcode element based on the charset. - FM
         */
        HText_setKcode(me->text, me->node_anchor->charset, p_in);
    }

    /*
     * Make sure we have META name/value pairs to handle. - FM
     */
    if (!(http_equiv || name) || !content)
        goto free_META_copies;

    /*
     * Check for a no-cache Pragma
     * or Cache-Control directive. - FM
     */
    if (!strcasecomp(NonNull(http_equiv), "Pragma") ||
        !strcasecomp(NonNull(http_equiv), "Cache-Control")) {
        LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
                                NO, NO, YES, st_other);
        if (!strcasecomp(content, "no-cache")) {
            me->node_anchor->no_cache = TRUE;
            HText_setNoCache(me->text);
        }

        /*
         * If we didn't get a Cache-Control MIME header, and the META has one,
         * convert to lowercase, store it in the anchor element, and if we
         * haven't yet set no_cache, check whether we should. - FM
         */
        if ((!me->node_anchor->cache_control) &&
            !strcasecomp(NonNull(http_equiv), "Cache-Control")) {
            LYLowerCase(content);
            StrAllocCopy(me->node_anchor->cache_control, content);
            if (me->node_anchor->no_cache == FALSE) {
                /*
                 * A "no-cache" token followed by end of string or ';'.
                 */
                cp0 = content;
                while ((cp = strstr(cp0, "no-cache")) != NULL) {
                    cp += 8;
                    while (*cp != '\0' && WHITE(*cp))
                        cp++;
                    if (*cp == '\0' || *cp == ';') {
                        me->node_anchor->no_cache = TRUE;
                        HText_setNoCache(me->text);
                        break;
                    }
                    cp0 = cp;
                }
                if (me->node_anchor->no_cache == TRUE)
                    goto free_META_copies;
                /*
                 * A "max-age" whose value is the single digit 0.
                 */
                cp0 = content;
                while ((cp = strstr(cp0, "max-age")) != NULL) {
                    cp += 7;
                    while (*cp != '\0' && WHITE(*cp))
                        cp++;
                    if (*cp == '=') {
                        cp++;
                        while (*cp != '\0' && WHITE(*cp))
                            cp++;
                        if (isdigit(UCH(*cp))) {
                            cp0 = cp;
                            while (isdigit(UCH(*cp)))
                                cp++;
                            if (*cp0 == '0' && cp == (cp0 + 1)) {
                                me->node_anchor->no_cache = TRUE;
                                HText_setNoCache(me->text);
                                break;
                            }
                        }
                    }
                    cp0 = cp;
                }
            }
        }

        /*
         * Check for an Expires directive. - FM
         */
    } else if (!strcasecomp(NonNull(http_equiv), "Expires")) {
        /*
         * If we didn't get an Expires MIME header, store it in the anchor
         * element, and if we haven't yet set no_cache, check whether we
         * should.  Note that we don't accept a Date header via META tags,
         * because it's likely to be untrustworthy, but do check for a Date
         * header from a server when making the comparison. - FM
         *
         * NOTE(review): the value is stored unconditionally here, i.e., any
         * expires value already in the anchor is replaced - confirm whether
         * that is intended.
         */
        LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
                                NO, NO, YES, st_other);
        StrAllocCopy(me->node_anchor->expires, content);
        if (me->node_anchor->no_cache == FALSE) {
            if (!strcmp(content, "0")) {
                /*
                 * The value is zero, which we treat as an absolute no-cache
                 * directive. - FM
                 */
                me->node_anchor->no_cache = TRUE;
                HText_setNoCache(me->text);

            } else if (me->node_anchor->date != NULL) {
                /*
                 * We have a Date header, so check if the value is less than or
                 * equal to that. - FM
                 */
                if (LYmktime(content, TRUE) <=
                    LYmktime(me->node_anchor->date, TRUE)) {
                    me->node_anchor->no_cache = TRUE;
                    HText_setNoCache(me->text);
                }

            } else if (LYmktime(content, FALSE) == 0) {
                /*
                 * We don't have a Date header, and the value is in past for
                 * us. - FM
                 */
                me->node_anchor->no_cache = TRUE;
                HText_setNoCache(me->text);
            }
        }

        /*
         * Check for a Refresh directive. - FM
         */
    } else if (!strcasecomp(NonNull(http_equiv), "Refresh")) {
        char *Seconds = NULL;

        LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
                                NO, NO, YES, st_other);
        /*
         * Note that href may come back allocated even when Seconds does not;
         * in that case it is released at free_META_copies.
         */
        LYParseRefreshURL(content, &Seconds, &href);

        if (Seconds) {
            if (href) {
                /*
                 * We found a URL field, so check it out. - FM
                 */
                if (!LYLegitimizeHREF(me, &href, TRUE, FALSE)) {
                    /*
                     * The specs require a complete URL, but this is a
                     * Netscapism, so don't expect the author to know that. -
                     * FM
                     */
                    HTUserMsg(REFRESH_URL_NOT_ABSOLUTE);
                    /*
                     * Use the document's address as the base. - FM
                     */
                    if (*href != '\0') {
                        temp = HTParse(href,
                                       me->node_anchor->address, PARSE_ALL);
                        StrAllocCopy(href, temp);
                        FREE(temp);
                    } else {
                        StrAllocCopy(href, me->node_anchor->address);
                        HText_setNoCache(me->text);
                    }

                } else {
                    /*
                     * Check whether to fill in localhost. - FM
                     */
                    LYFillLocalFileURL(&href,
                                       (me->inBASE ?
                                        me->base_href : me->node_anchor->address));
                }

                /*
                 * Set the no_cache flag if the Refresh URL is the same as the
                 * document's address. - FM
                 */
                if (!strcmp(href, me->node_anchor->address)) {
                    HText_setNoCache(me->text);
                }
            } else {
                /*
                 * We didn't find a URL field, so use the document's own
                 * address and set the no_cache flag. - FM
                 */
                StrAllocCopy(href, me->node_anchor->address);
                HText_setNoCache(me->text);
            }
            /*
             * Check for an anchor in http or https URLs. - FM
             */
            cp = NULL;
#ifndef DONT_TRACK_INTERNAL_LINKS
            /* id_string seems to be used wrong below if given.
               not that it matters much.  avoid setting it here. - kw */
            if ((StrNCmp(href, "http", 4) == 0) &&
                (cp = strchr(href, '#')) != NULL) {
                StrAllocCopy(id_string, cp);
                *cp = '\0';
            }
#endif
            if (me->inA) {
                /*
                 * Ugh!  The META tag, which is a HEAD element, is in an
                 * Anchor, which is BODY element.  All we can do is close the
                 * Anchor and cross our fingers. - FM
                 */
                if (me->inBoldA == TRUE && me->inBoldH == FALSE)
                    HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
                me->inBoldA = FALSE;
                HText_endAnchor(me->text, me->CurrentANum);
                me->inA = FALSE;
                me->CurrentANum = 0;
            }
            me->CurrentA = HTAnchor_findChildAndLink
                (
                    me->node_anchor,    /* Parent */
                    id_string,  /* Tag */
                    href,       /* Address */
                    (HTLinkType *) 0);  /* Type */
            /*
             * Put back the '#' that was overwritten above, so that the
             * complete URL is displayed.
             */
            if (id_string)
                *cp = '#';
            FREE(id_string);
            LYEnsureSingleSpace(me);
            if (me->inUnderline == FALSE)
                HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
            HTML_put_string(me, "REFRESH(");
            HTML_put_string(me, Seconds);
            HTML_put_string(me, " sec):");
            FREE(Seconds);
            if (me->inUnderline == FALSE)
                HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
            HTML_put_character(me, ' ');
            me->in_word = NO;
            HText_beginAnchor(me->text, me->inUnderline, me->CurrentA);
            if (me->inBoldH == FALSE)
                HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
            HTML_put_string(me, href);
            FREE(href);
            if (me->inBoldH == FALSE)
                HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
            HText_endAnchor(me->text, 0);
            LYEnsureSingleSpace(me);
        }

        /*
         * Check for a suggested filename via a Content-Disposition with a
         * filename=name.suffix in it, if we don't already have it via a server
         * header. - FM
         */
    } else if (isEmpty(me->node_anchor->SugFname) &&
               !strcasecomp((http_equiv ?
                             http_equiv : ""), "Content-Disposition")) {
        cp = content;
        while (*cp != '\0' && strncasecomp(cp, "filename", 8))
            cp++;
        if (*cp != '\0') {
            cp = LYSkipBlanks(cp + 8);
            if (*cp == '=')
                cp++;
            cp = LYSkipBlanks(cp);
            if (*cp != '\0') {
                StrAllocCopy(me->node_anchor->SugFname, cp);
                if (*me->node_anchor->SugFname == '"') {
                    /*
                     * Quoted name: cut after the closing quote and remove
                     * the quotes; discard it if no closing quote is found
                     * or nothing is left.
                     */
                    if ((cp = strchr((me->node_anchor->SugFname + 1),
                                     '"')) != NULL) {
                        *(cp + 1) = '\0';
                        HTMIME_TrimDoubleQuotes(me->node_anchor->SugFname);
                        if (isEmpty(me->node_anchor->SugFname)) {
                            FREE(me->node_anchor->SugFname);
                        }
                    } else {
                        FREE(me->node_anchor->SugFname);
                    }
                }
#if defined(UNIX) && !defined(DOSPATH)
                /*
                 * If blanks are not legal for local filenames, replace them
                 * with underscores.
                 */
                if ((cp = me->node_anchor->SugFname) != NULL) {
                    while (*cp != '\0') {
                        if (isspace(UCH(*cp)))
                            *cp = '_';
                        ++cp;
                    }
                }
#endif
            }
        }
        /*
         * Check for a Set-Cookie directive. - AK
         */
    } else if (!strcasecomp(NonNull(http_equiv), "Set-Cookie")) {
        /*
         * This will need to be updated when Set-Cookie/Set-Cookie2 handling is
         * finalized.  For now, we'll still assume "historical" cookies in META
         * directives. - FM
         */
        url_type = is_url(me->inBASE ?
                          me->base_href : me->node_anchor->address);
        if (url_type == HTTP_URL_TYPE || url_type == HTTPS_URL_TYPE) {
            LYSetCookie(content,
                        NULL,
                        (me->inBASE ?
                         me->base_href : me->node_anchor->address));
        }
    }

    /*
     * Free the copies. - FM
     *
     * href is included because a Refresh without a leading delay can leave
     * a parsed URL behind that nothing else releases.
     */
  free_META_copies:
    FREE(http_equiv);
    FREE(name);
    FREE(content);
    FREE(charset);
    FREE(href);
}
/*
 * Handle a P or P-like element in an HTML stream.
 *
 * start is non-zero for a start tag and zero for an end tag; align_idx is
 * the index of the element's ALIGN attribute within present[] / value[].
 *
 * The function emits the vertical spacing appropriate for the current
 * context (list, ADDRESS, or anything else), picks the alignment of the
 * current style, lets an ALIGN attribute on a start tag adjust it, and
 * finally clears me->inP to mark that the new paragraph has no text yet.
 * Start and end tags share most of this, but the spacing can differ. - FM
 */
void LYHandlePlike(HTStructured * me, const BOOL *present,
                   const char **value,
                   char **include GCC_UNUSED,
                   int align_idx,
                   int start)
{
    /*
     * FIG content should be a true block, which like P inherits the current
     * style.  APPLET is like character elements or an ALT attribute, unless
     * its content contains a block element.  A P start tag inside either
     * one flags that content as a block. - FM
     */
    if (start && me->inFIG)
        me->inFIGwithP = TRUE;
    if (start && me->inAPPLET)
        me->inAPPLETwithP = TRUE;

    UPDATE_STYLE;

    /*
     * Vertical spacing.
     */
    if (me->List_Nesting_Level >= 0) {
        /*
         * In a list: make sure of a blank line (or at least a line break)
         * if the current paragraph already has text. - FM
         */
        if (me->inP) {
            const int in_block_context = (me->inFIG || me->inAPPLET ||
                                          me->inCAPTION || me->inCREDIT);

            if (in_block_context ||
                me->sp->style->spaceAfter > 0 ||
                (start && me->sp->style->spaceBefore > 0)) {
                LYEnsureDoubleSpace(me);
            } else {
                LYEnsureSingleSpace(me);
            }
        }
    } else if (me->sp[0].tag_number == HTML_ADDRESS) {
        /*
         * In an ADDRESS: just start a new line if the last one is not
         * empty. - FM
         */
        if (!HText_LastLineEmpty(me->text, FALSE)) {
            HText_setLastChar(me->text, ' ');   /* absorb white space */
            HText_appendCharacter(me->text, '\r');
        }
    } else {
        if (!start) {
            if (me->sp->style->spaceAfter > 0) {
                LYEnsureDoubleSpace(me);
            } else {
                LYEnsureSingleSpace(me);
            }
        } else if (!me->inLABEL || me->inP) {
            /* no paragraph break right after a LABEL that has no text */
            HText_appendParagraph(me->text);
        }
        me->inLABEL = FALSE;
    }
    me->in_word = NO;

    /*
     * Base alignment of the current style.
     */
    if (LYoverride_default_alignment(me)) {
        me->sp->style->alignment = LYstyles(me->sp[0].tag_number)->alignment;
    } else {
        const int style_id = me->sp->style->id;
        const int div_style_in_list = (me->List_Nesting_Level >= 0 &&
                                       (style_id == ST_DivCenter ||
                                        style_id == ST_DivLeft ||
                                        style_id == ST_DivRight));
        const int plain_style_outside_div = (me->Division_Level < 0 &&
                                             (style_id == ST_Normal ||
                                              style_id == ST_Preformatted));

        if (div_style_in_list || plain_style_outside_div) {
            me->sp->style->alignment = HT_LEFT;
        } else {
            me->sp->style->alignment = (short) me->current_default_alignment;
        }
    }

    /*
     * An ALIGN attribute on a start tag may adjust that.  CENTER and RIGHT
     * are not honored in a list while the paragraph has no text.
     */
    if (start && present && present[align_idx] && value[align_idx]) {
        const char *align = value[align_idx];
        const int may_shift = (me->List_Nesting_Level < 0 || me->inP);

        if (may_shift && !strcasecomp(align, "center")) {
            me->sp->style->alignment = HT_CENTER;
        } else if (may_shift && !strcasecomp(align, "right")) {
            me->sp->style->alignment = HT_RIGHT;
        } else if (!strcasecomp(align, "left") ||
                   !strcasecomp(align, "justify")) {
            me->sp->style->alignment = HT_LEFT;
        }
    }

    /*
     * Mark that we are starting a new paragraph and don't have any of its
     * text yet. - FM
     */
    me->inP = FALSE;
}
/*
 * Handle a SELECT element in an HTML stream.
 *
 * A start value of TRUE means a start tag; any other value is treated as
 * an end tag. - FM
 *
 * Start tag: records that we are inside a SELECT, works out the field name
 * and the MULTIPLE / DISABLED flags, suspends bold and underline where
 * needed, and opens the select via HText_beginSelect().
 *
 * End tag: finishes the last OPTION, writes either a line break or the
 * text returned for the last option followed by ']', and restores
 * underline and bold.
 */
void LYHandleSELECT(HTStructured * me, const BOOL *present,
                    const char **value,
                    char **include GCC_UNUSED,
                    int start)
{
    char *field_name = NULL;
    char *size_attr = NULL;     /* never set; SIZE is ignored, see below */
    BOOLEAN multiple = NO;
    int options_inline;

    if (start != TRUE) {
        /*
         * Handle end tag.
         */
        char *shown;

        /*
         * Make sure we had a select start tag.
         */
        if (!me->inSELECT) {
            if (LYBadHTML(me)) {
                LYShowBadHTML("Bad HTML: Unmatched SELECT end tag\n");
            }
            return;
        }

        /*
         * We are no longer in a select tag, and nothing is disabled.
         */
        me->inSELECT = FALSE;
        me->select_disabled = FALSE;

        /*
         * Finish the data off, then finish the previous option.
         */
        HTChunkTerminate(&me->option);
        shown = HText_setLastOptionValue(me->text,
                                         me->option.data,
                                         me->LastOptionValue,
                                         LAST_ORDER,
                                         me->LastOptionChecked,
                                         me->UCLYhndl,
                                         ATTR_CS_IN);
        FREE(me->LastOptionValue);
        me->LastOptionChecked = FALSE;

        if (HTCurSelectGroupType != F_CHECKBOX_TYPE &&
            LYSelectPopups != FALSE) {
            /*
             * Output popup box with the default option to screen, but use
             * non-breaking spaces for output.  Every character of the
             * returned text is written, whether or not we are in PRE.
             */
            for (; non_empty(shown); shown++) {
                if (*shown != ' ')
                    HText_appendCharacter(me->text, *shown);
                else
                    HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
            }
            /*
             * Add end option character.
             */
            if (!me->first_option) {
                HText_appendCharacter(me->text, ']');
                HText_setLastChar(me->text, ']');
                me->in_word = YES;
            }
        } else {
            /*
             * Start a newline after the last checkbox/button option.
             */
            LYEnsureSingleSpace(me);
        }
        HTChunkClear(&me->option);

        /*
         * Resume underline and bold if they are still wanted.
         */
        if (me->Underline_Level > 0 && me->inUnderline == FALSE) {
            HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
            me->inUnderline = TRUE;
        }
        if (me->needBoldH == TRUE && me->inBoldH == FALSE) {
            HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
            me->inBoldH = TRUE;
            me->needBoldH = FALSE;
        }
        return;
    }

    /*
     * Handle start tag.  Initialize the disable attribute.
     */
    me->select_disabled = FALSE;

    /*
     * Check for unclosed TEXTAREA.
     */
    if (me->inTEXTAREA && LYBadHTML(me)) {
        LYShowBadHTML("Bad HTML: Missing TEXTAREA end tag\n");
    }

    /*
     * Set to know we are in a select tag.
     */
    me->inSELECT = TRUE;

    /*
     * Field name: empty if none was given; unescaped only if it contains
     * an ampersand.
     */
    if (present && present[HTML_SELECT_NAME] &&
        non_empty(value[HTML_SELECT_NAME])) {
        StrAllocCopy(field_name, value[HTML_SELECT_NAME]);
        if (strchr(field_name, '&') != NULL) {
            UNESCAPE_FIELDNAME_TO_STD(&field_name);
        }
    } else {
        StrAllocCopy(field_name, "");
    }

    if (present) {
        if (present[HTML_SELECT_MULTIPLE])
            multiple = YES;
        if (present[HTML_SELECT_DISABLED])
            me->select_disabled = TRUE;
        if (present[HTML_SELECT_SIZE] &&
            non_empty(value[HTML_SELECT_SIZE])) {
            /*
             * Let the size be determined by the number of OPTIONs. - FM
             */
            CTRACE((tfp, "LYHandleSELECT: Ignoring SIZE=\"%s\" for SELECT.\n",
                    value[HTML_SELECT_SIZE]));
        }
    }

    /*
     * Suspend bold and underline unless this is a MULTIPLE select while
     * popups are enabled.
     */
    options_inline = (multiple == NO || LYSelectPopups == FALSE);
    if (options_inline && me->inBoldH == TRUE) {
        HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
        me->inBoldH = FALSE;
        me->needBoldH = TRUE;
    }
    if (options_inline && me->inUnderline == TRUE) {
        HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
        me->inUnderline = FALSE;
    }

    if (multiple == NO && LYSelectPopups == TRUE) {
        const int fixed_layout = (me->sp[0].tag_number == HTML_PRE ||
                                  me->inPRE == TRUE ||
                                  !me->sp->style->freeFormat);

        if (fixed_layout &&
            HText_LastLineSize(me->text, FALSE) > (LYcolLimit - 7)) {
            /*
             * Force a newline when we're using a popup in a PRE block and
             * are within 7 columns from the right margin.  This will allow
             * for the '[' popup designator and help avoid a wrap in the
             * underscore placeholder for the retracted popup entry in the
             * HText structure. - FM
             */
            HTML_put_character(me, '\n');
            me->in_word = NO;
        }
    }

    LYCheckForID(me, present, value, (int) HTML_SELECT_ID);

    HText_beginSelect(field_name, ATTR_CS_IN, multiple, size_attr);
    FREE(field_name);
    FREE(size_attr);

    me->first_option = TRUE;
}
/*
 * This function strips white characters and generally fixes up attribute
 * values that were received from the SGML parser and are to be treated as
 * partial or absolute URLs. - FM
 *
 * Parameters:
 *   me           the structured-stream object; its inBASE, base_href,
 *                node_anchor->address and inBadHREF members are consulted.
 *   href         address of the allocated attribute value.  The string is
 *                edited in place and may be replaced by a new allocation.
 *   force_slash  if non-zero, a value of exactly "." or ".." gets a "/"
 *                appended, unless the base is a file URL.
 *   strip_dots   if non-zero (and LYStripDotDotURLs is set), lead "/.."
 *                path elements are stripped when the value would be
 *                resolved against an http or https base.
 *
 * Returns the result of is_url() for the cleaned-up value, or 0 if the
 * function returned before that check (missing arguments or empty value).
 */
int LYLegitimizeHREF(HTStructured * me, char **href,
                     int force_slash,
                     int strip_dots)
{
    int url_type = 0;
    char *p = NULL;
    char *pound = NULL;
    const char *Base = NULL;

    if (!me || !href || isEmpty(*href))
        return (url_type);

    if (!LYTrimStartfile(*href)) {
        /*
         * Collapse spaces in the actual URL, but just protect against tabs or
         * newlines in the fragment, if present.  This seeks to cope with
         * atrocities inflicted on the Web by authoring tools such as
         * Frontpage. - FM
         */

        /* Before working on spaces check if we have any, usually none. */
        p = LYSkipNonBlanks(*href);
        if (*p) {               /* p == first space character */
            /* no reallocs below, all converted in place */
            pound = findPoundSelector(*href);

            if (pound != NULL && pound < p) {
                /* The first blank is already inside the fragment. */
                convert_to_spaces(p, FALSE);    /* done */
            } else {
                if (pound != NULL)
                    *pound = '\0';      /* mark: hide the fragment for now */

                /*
                 * No blanks really belong in the HREF,
                 * but if it refers to an actual file,
                 * it may actually have blanks in the name.
                 * Try to accommodate.  See also HTParse().
                 */
                if (LYRemoveNewlines(p) || strchr(p, '\t') != 0) {
                    LYRemoveBlanks(p);  /* a compromise... */
                }

                if (pound != NULL) {
                    p = strchr(p, '\0');
                    *pound = '#';       /* restore */
                    convert_to_spaces(pound, FALSE);
                    /*
                     * Close the gap left by removed blanks.  Source and
                     * destination lie in the same buffer and may overlap,
                     * so memmove() is required here; strcpy() on
                     * overlapping objects is undefined behavior.
                     */
                    if (p < pound)
                        memmove(p, pound, strlen(pound) + 1);
                }
            }
        }
    }
    if (**href == '\0')
        return (url_type);

    TRANSLATE_AND_UNESCAPE_TO_STD(href);

    /* Partial references are resolved against the BASE, if one is active. */
    Base = me->inBASE ?
        me->base_href : me->node_anchor->address;

    url_type = is_url(*href);
    if (!url_type && force_slash && **href == '.' &&
        (!strcmp(*href, ".") || !strcmp(*href, "..")) &&
        !isFILE_URL(Base)) {
        /*
         * The Fielding RFC/ID for resolving partial HREFs says that a slash
         * should be on the end of the preceding symbolic element for "." and
         * "..", but all tested browsers only do that for an explicit "./" or
         * "../", so we'll respect the RFC/ID only if force_slash was TRUE and
         * it's not a file URL. - FM
         */
        StrAllocCat(*href, "/");
    }
    if ((!url_type && LYStripDotDotURLs && strip_dots && **href == '.') &&
        !strncasecomp(Base, "http", 4)) {
        /*
         * We will be resolving a partial reference versus an http or https
         * URL, and it has lead dots, which may be retained when resolving via
         * HTParse(), but the request would fail if the first element of the
         * resultant path is two dots, because no http or https server accepts
         * such paths, and the current URL draft, likely to become an RFC, says
         * that it's optional for the UA to strip them as a form of error
         * recovery.  So we will, recursively, for http/https URLs, like the
         * "major market browsers" which made this problem so common on the
         * Web, but we'll also issue a message about it, such that the bad
         * partial reference might get corrected by the document provider. -
         * FM
         */
        char *temp = NULL, *path = NULL, *cp;
        const char *str = "";

        temp = HTParse(*href, Base, PARSE_ALL);
        path = HTParse(temp, "", PARSE_PATH + PARSE_PUNCTUATION);
        if (!StrNCmp(path, "/..", 3)) {
            cp = (path + 3);
            if (LYIsHtmlSep(*cp) || *cp == '\0') {
                /* Only used to print "http" vs. "https" in the trace. */
                if (Base[4] == 's') {
                    str = "s";
                }
                CTRACE((tfp,
                        "LYLegitimizeHREF: Bad value '%s' for http%s URL.\n",
                        *href, str));
                CTRACE((tfp, " Stripping lead dots.\n"));
                /* Tell the user only once per run of bad references. */
                if (!me->inBadHREF) {
                    HTUserMsg(BAD_PARTIAL_REFERENCE);
                    me->inBadHREF = TRUE;
                }
            }
            if (*cp == '\0') {
                /* The path was exactly "/..". */
                StrAllocCopy(*href, "/");
            } else if (LYIsHtmlSep(*cp)) {
                /* Skip any further lead "/.." elements. */
                while (!StrNCmp(cp, "/..", 3)) {
                    if (*(cp + 3) == '/') {
                        cp += 3;
                        continue;
                    } else if (*(cp + 3) == '\0') {
                        /* A trailing "/.." collapses to "/". */
                        *(cp + 1) = '\0';
                        *(cp + 2) = '\0';
                    }
                    break;
                }
                StrAllocCopy(*href, cp);
            }
        }
        FREE(temp);
        FREE(path);
    }
    return (url_type);
}
/*
 * This function looks for a Content-Base header value on the document's
 * anchor and, failing that, for a Content-Location header value which is
 * an absolute URL.  If a usable value is found, it is handed to the
 * stream's start_element method as the HREF of a BASE tag.  If set this
 * way, it will be replaced by any BASE tag in the HTML stream itself. - FM
 */
void LYCheckForContentBase(HTStructured * me)
{
    const char *header = NULL;
    BOOL must_be_url = NO;
    char *base = NULL;
    BOOL present[HTML_BASE_ATTRIBUTES];
    const char *value[HTML_BASE_ATTRIBUTES];
    int n;

    if (me == NULL || me->node_anchor == NULL)
        return;

    /*
     * Prefer Content-Base; fall back on Content-Location, which is
     * accepted only if it turns out to be an absolute URL. - FM
     */
    if (me->node_anchor->content_base != NULL) {
        header = me->node_anchor->content_base;
    } else if (me->node_anchor->content_location != NULL) {
        header = me->node_anchor->content_location;
        must_be_url = YES;
    } else {
        /*
         * We had neither a Content-Base nor Content-Location value. - FM
         */
        return;
    }

    /*
     * A zero-length header value is of no use. - FM
     */
    if (header[0] == '\0')
        return;

    StrAllocCopy(base, header);
    LYRemoveBlanks(base);

    /*
     * Ignore a Content-Location that is not an absolute URL, and any
     * value which collapsed to zero length. - FM
     */
    if ((must_be_url && !is_url(base)) || base[0] == '\0') {
        FREE(base);
        return;
    }

    /*
     * Pass the value to the start_element method as the HREF of a BASE
     * tag; HREF is the only attribute marked as present. - FM
     */
    for (n = 0; n < HTML_BASE_ATTRIBUTES; n++)
        present[n] = NO;
    present[HTML_BASE_HREF] = YES;
    value[HTML_BASE_HREF] = (const char *) base;
    (*me->isa->start_element) (me, HTML_BASE, present, value,
                               0, 0);
    FREE(base);
}
/*
 * This function creates a NAMEd Anchor if the attribute with index
 * 'attribute' (a NAME or ID attribute of the tag) was present with a
 * non-zero-length value. - FM
 *
 * Nothing is done if 'me' or its text object is missing.
 */
void LYCheckForID(HTStructured * me, const BOOL *present,
                  const char **value,
                  int attribute)
{
    HTChildAnchor *anchor = NULL;
    char *id = NULL;

    if (me == NULL || me->text == NULL)
        return;
    if (present == NULL || !present[attribute] || isEmpty(value[attribute]))
        return;

    /*
     * Work on a private copy, translating any named or numeric character
     * references. - FM
     */
    StrAllocCopy(id, value[attribute]);
    LYUCTranslateHTMLString(&id, me->tag_charset, me->tag_charset,
                            NO, NO, YES, st_URL);

    /*
     * Create the link if we still have a non-zero-length string. - FM
     */
    if (id[0] != '\0') {
        anchor = HTAnchor_findChildAndLink(me->node_anchor,     /* Parent */
                                           id,                  /* Tag */
                                           NULL,                /* Address */
                                           (HTLinkType *) 0);   /* Type */
        if (anchor) {
            /* An empty anchor: begun and ended right away. */
            HText_beginAnchor(me->text, me->inUnderline, anchor);
            HText_endAnchor(me->text, 0);
        }
    }
    FREE(id);
}
/*
 * This function creates a NAMEd Anchor for the ID string passed to it
 * directly as an argument.  It assumes the string does not need checking
 * for character references. - FM
 *
 * Nothing is done if 'me', its text object, or a non-empty id is missing.
 */
void LYHandleID(HTStructured * me, const char *id)
{
    HTChildAnchor *anchor;

    if (me == NULL || me->text == NULL || isEmpty(id))
        return;

    /*
     * Look up (or create) the child anchor for this ID. - FM
     */
    anchor = HTAnchor_findChildAndLink(me->node_anchor,         /* Parent */
                                       id,                      /* Tag */
                                       NULL,                    /* Address */
                                       (HTLinkType *) 0);       /* Type */
    if (anchor == NULL)
        return;

    /* An empty anchor: begun and ended right away. */
    HText_beginAnchor(me->text, me->inUnderline, anchor);
    HText_endAnchor(me->text, 0);
}
/*
 * This function checks whether the element on top of the stack is one
 * for which the current default paragraph alignment is to be overridden
 * (BLOCKQUOTE, BQ, NOTE, FN, ADDRESS).  If so, the alignment of the
 * element's style is set to HT_LEFT and YES is returned; otherwise (or
 * if 'me' is NULL) nothing is changed and NO is returned. - FM
 */
BOOLEAN LYoverride_default_alignment(HTStructured * me)
{
    BOOLEAN overridden = NO;

    if (me != NULL) {
        switch (me->sp[0].tag_number) {
        case HTML_BLOCKQUOTE:
        case HTML_BQ:
        case HTML_NOTE:
        case HTML_FN:
        case HTML_ADDRESS:
            me->sp->style->alignment = HT_LEFT;
            overridden = YES;
            break;

        default:
            break;
        }
    }
    return overridden;
}
/*
 * This function inserts newlines if needed to create double spacing:
 * two if the last line is not empty, one if only the previous line is
 * not empty.  If no newline is needed and we are inside a list,
 * HText_NegateLineOne() is called instead.  In all cases the in_word
 * flag is cleared. - FM
 */
void LYEnsureDoubleSpace(HTStructured * me)
{
    int breaks = 0;

    if (me == NULL || me->text == NULL)
        return;

    /* Work out how many line breaks are still missing. */
    if (!HText_LastLineEmpty(me->text, FALSE))
        breaks = 2;
    else if (!HText_PreviousLineEmpty(me->text, FALSE))
        breaks = 1;

    if (breaks > 0) {
        HText_setLastChar(me->text, ' ');       /* absorb white space */
        while (breaks-- > 0)
            HText_appendCharacter(me->text, '\r');
    } else if (me->List_Nesting_Level >= 0) {
        HText_NegateLineOne(me->text);
    }

    me->in_word = NO;
}
/*
 * This function inserts a newline if needed to create single spacing,
 * i.e., if the last line is not empty.  If the last line is already
 * empty and we are inside a list, HText_NegateLineOne() is called
 * instead.  In all cases the in_word flag is cleared. - FM
 */
void LYEnsureSingleSpace(HTStructured * me)
{
    if (me == NULL || me->text == NULL)
        return;

    if (HText_LastLineEmpty(me->text, FALSE)) {
        /* Already on a fresh line. */
        if (me->List_Nesting_Level >= 0)
            HText_NegateLineOne(me->text);
    } else {
        HText_setLastChar(me->text, ' ');       /* absorb white space */
        HText_appendCharacter(me->text, '\r');
    }

    me->in_word = NO;
}
/*
 * This function resets the paragraph alignment of the current style.
 * The alignment becomes HT_LEFT when we are inside a list, or when we
 * are not inside a division and the style is Normal or Preformatted;
 * otherwise it becomes the current default alignment. - FM
 */
void LYResetParagraphAlignment(HTStructured * me)
{
    int force_left;

    if (me == NULL)
        return;

    force_left = (me->List_Nesting_Level >= 0) ||
        (me->Division_Level < 0 &&
         (me->sp->style->id == ST_Normal ||
          me->sp->style->id == ST_Preformatted));

    if (!force_left) {
        me->sp->style->alignment = (short) me->current_default_alignment;
    } else {
        me->sp->style->alignment = HT_LEFT;
    }
}
/*
 * This example function checks whether the given anchor has an address
 * with a file scheme which refers to the local host, and if so, loads
 * that address into the SGML parser's context->url element, which was
 * passed as the second argument, and returns TRUE.  Otherwise it returns
 * FALSE and leaves *url alone.  The handle_comment() calling function in
 * SGML.c then calls LYDoCSI() in LYUtils.c to insert HTML markup into the
 * corresponding stream, homologously to an SSI by an HTTP server. - FM
 *
 * For functions similar to this but which depend on details of the HTML
 * handler's internal data, the calling interface should be changed, and
 * functions in SGML.c would have to make sure not to call such functions
 * inappropriately (e.g., calling a function specific to the
 * Lynx_HTML_Handler when SGML.c output goes to some other HTStructured
 * object like in HTMLGen.c), or the new functions could be added to the
 * SGML.h interface.
 */
BOOLEAN LYCheckForCSI(HTParentAnchor *anchor,
                      char **url)
{
    /* All three tests must pass: an address, a file URL, the local host. */
    if (anchor == NULL || anchor->address == NULL ||
        !isFILE_URL(anchor->address) ||
        !LYisLocalHost(anchor->address)) {
        return FALSE;
    }

    StrAllocCopy(*url, anchor->address);
    return TRUE;
}
/*
 * This function is called from the SGML parser to look at comments
 * and see whether we should collect some info from them.  Currently
 * it only looks for comments with Message-Id and Subject info, in the
 * exact form generated by MHonArc for archived mailing list.  If found,
 * the info is stored in the document's HTParentAnchor.  It can later be
 * used for generating a mail response.
 *
 * We are extra picky here because there isn't any official definition
 * for these kinds of comments - we might (and still can) misinterpret
 * arbitrary comments as something they aren't.
 *
 * If something doesn't look right, for example invalid characters, the
 * strings are not stored.  Mail responses will use something else as
 * the subject, probably the document URL, and will not have an
 * In-Reply-To header.
 *
 * All this is a hack - to do this the right way, mailing list archivers
 * would have to agree on some better mechanism to make this kind of info
 * from original mail headers available, for example using LINK. - kw
 *
 * Parameters:
 *   anchor   the document's parent anchor; must have an address.
 *   comment  the comment text, expected to start with "!--X-Message-Id: "
 *            or "!--X-Subject: " and to end with " --".
 *
 * Returns TRUE only if a value was extracted and HTAnchor_setMessageID()
 * or HTAnchor_setSubject() accepted it; FALSE in all other cases.  The
 * temporary copy of the value is always freed before returning.
 */
BOOLEAN LYCommentHacks(HTParentAnchor *anchor,
                       const char *comment)
{
    const char *cp;
    size_t len;
    BOOLEAN stored;

    if (comment == NULL)
        return FALSE;

    if (!(anchor && anchor->address))
        return FALSE;

    /* 17 == strlen("!--X-Message-Id: ") */
    if (StrNCmp(comment, "!--X-Message-Id: ", 17) == 0) {
        char *messageid = NULL;
        char *p;

        /*
         * The raw value must consist of graphic 7-bit characters only,
         * followed by exactly " --".
         */
        for (cp = comment + 17; *cp; cp++) {
            if (UCH(*cp) >= 127 || !isgraph(UCH(*cp))) {
                break;
            }
        }
        if (strcmp(cp, " --")) {
            return FALSE;
        }
        cp = comment + 17;
        StrAllocCopy(messageid, cp);
        /* This should be ok - message-id should only contain 7-bit ASCII */
        if (!LYUCTranslateHTMLString(&messageid, 0, 0, NO, NO, YES, st_URL)) {
            FREE(messageid);    /* don't leak the copy on failure */
            return FALSE;
        }
        /* Repeat the character check on the translated string. */
        for (p = messageid; *p; p++) {
            if (UCH(*p) >= 127 || !isgraph(UCH(*p))) {
                break;
            }
        }
        if (strcmp(p, " --")) {
            FREE(messageid);
            return FALSE;
        }
        /* Require an '@' which is followed by something. */
        if ((p = strchr(messageid, '@')) == NULL || p[1] == '\0') {
            FREE(messageid);
            return FALSE;
        }
        /* Chop off the trailing " --". */
        p = messageid;
        if ((len = strlen(p)) >= 8 && !strcmp(&p[len - 3], " --")) {
            p[len - 3] = '\0';
        } else {
            FREE(messageid);
            return FALSE;
        }
        stored = HTAnchor_setMessageID(anchor, messageid) ? TRUE : FALSE;
        FREE(messageid);
        return stored;
    }

    /* 14 == strlen("!--X-Subject: ") */
    if (StrNCmp(comment, "!--X-Subject: ", 14) == 0) {
        char *subject = NULL;
        char *p;

        /* The raw value must consist of printable 7-bit characters only. */
        for (cp = comment + 14; *cp; cp++) {
            if (UCH(*cp) >= 127 || !isprint(UCH(*cp))) {
                return FALSE;
            }
        }
        cp = comment + 14;
        StrAllocCopy(subject, cp);
        /* @@@
         * This may not be the right thing for the subject - but mail
         * subjects shouldn't contain 8-bit characters in raw form anyway.
         * We have to unescape character entities, since that's what MHonArc
         * seems to generate.  But if after that there are 8-bit characters
         * the string is rejected.  We would probably not know correctly
         * what charset to assume anyway - the mail sender's can differ from
         * the archive's.  And the code for sending mail cannot deal well
         * with 8-bit characters - we should not put them in the Subject
         * header in raw form, but don't have MIME encoding implemented.
         * Someone may want to do more about this...  - kw
         */
        if (!LYUCTranslateHTMLString(&subject, 0, 0, NO, YES, NO, st_HTML)) {
            FREE(subject);      /* don't leak the copy on failure */
            return FALSE;
        }
        /* Repeat the character check on the translated string. */
        for (p = subject; *p; p++) {
            if (UCH(*p) >= 127 || !isprint(UCH(*p))) {
                FREE(subject);
                return FALSE;
            }
        }
        /* Chop off the trailing " --". */
        p = subject;
        if ((len = strlen(p)) >= 4 && !strcmp(&p[len - 3], " --")) {
            p[len - 3] = '\0';
        } else {
            FREE(subject);
            return FALSE;
        }
        stored = HTAnchor_setSubject(anchor, subject) ? TRUE : FALSE;
        FREE(subject);
        return stored;
    }

    return FALSE;
}
/*
 * Create the Title in *dst from the text in src.
 *
 * If the CJK mode is JAPANESE, the text is first passed through TO_EUC()
 * or TO_SJIS(), as selected by kanji_code, into a temporary buffer of
 * strlen(src) + 1 bytes; for any other kanji_code a trace message is
 * written and the text is used unchanged.  In all other cases src is
 * simply copied.
 *
 * NOTE(review): earlier versions of this comment described converting
 * angle-brackets, ampersands and 8-bit letters to entities; no such
 * conversion is done in this function.
 */
void LYformTitle(char **dst,
                 const char *src)
{
    char *converted = NULL;

    if (HTCJK != JAPANESE) {
        StrAllocCopy(*dst, src);
        return;
    }

    converted = (char *) malloc(strlen(src) + 1);
    if (converted == 0)
        outofmem(__FILE__, "LYformTitle");

    assert(converted != NULL);

    switch (kanji_code) {       /* 1997/11/22 (Sat) 09:28:00 */
    case EUC:
        TO_EUC((const unsigned char *) src, (unsigned char *) converted);
        break;

    case SJIS:
        TO_SJIS((const unsigned char *) src, (unsigned char *) converted);
        break;

    default:
        CTRACE((tfp, "\nLYformTitle: kanji_code is an unexpected value."));
        strcpy(converted, src);
        break;
    }

    StrAllocCopy(*dst, converted);
    FREE(converted);
}