#include <HTUtils.h>
#include <HTCJK.h>
#include <HTMLDTD.h>
#include <LYGlobalDefs.h>
#include <UCMap.h>
#include <UCDefs.h>
#include <LYCharSets.h>
#include <LYCharUtils.h>
#include <HTFont.h>
#include <GridText.h>
#include <LYCurses.h>
#include <LYStrings.h>
#include <LYexit.h>
#include <LYLeaks.h>
/*
 * Character-passing flags, declared extern here.  In this file they are
 * assigned by HTMLSetCharacterHandling().
 */
extern BOOL HTPassEightBitRaw;
extern BOOL HTPassEightBitNum;
extern BOOL HTPassHighCtrlRaw;
extern BOOL HTPassHighCtrlNum;
/* Current CJK mode; assigned by HTMLSetCharacterHandling() and Set_HTCJK(). */
extern HTCJKlang HTCJK;
/* Kanji encoding selected together with HTCJK in HTMLSetCharacterHandling(). */
PUBLIC HTkcode kanji_code = NOKANJI;
/* Set by HTMLSetHaveCJKCharacterSet(): TRUE when the charset's enc is UCT_ENC_CJK. */
PUBLIC BOOLEAN LYHaveCJKCharacterSet = FALSE;
/* Set by HTMLSetDisplayCharsetMatchLocale(). */
PUBLIC BOOLEAN DisplayCharsetMatchLocale = TRUE;
/* Chartrans initializer and its status flag; used by LYCharSetsDeclared(). */
extern void UCInit NOARGS;
extern int UCInitialized;
PUBLIC int LYNumCharsets = 0; /* Will be initialized later by UC_Register. */
PUBLIC int current_char_set = -1; /* will be initialized later in LYMain.c */
PUBLIC CONST char** p_entity_values = NULL; /* Pointer, for HTML_put_entity() */
/* Original note: obsolete and probably not used(???) */
/* p_entity_values is assigned in HTMLUseCharacterSet(). */
/*
* INSTRUCTIONS for adding new character sets which do not have
* Unicode tables.
*
* Currently we only declare some properties of each charset here
* (such as MIME names, etc.); the real mapping is not included.
*
* [We hope you no longer need to correct or add old-style mappings
* such as ISO_Latin1[] or SevenBitApproximations[]: translation now
* works via the new chartrans mechanism, and the old tables are kept
* for compatibility only. We should clean this up, but that is not
* so easy...]
*
* There is a place marked "Add your new character sets HERE" in this file.
* Make up a character set and add it in the same
* style as the ISO_Latin1 set below, giving it a unique name.
*
* Add the name of the set to LYCharSets.
* Similarly add the appropriate information to the tables below:
* LYchar_set_names, LYCharSet_UC, LYlowest_eightbit.
* These 4 tables all MUST have the same order.
* (And this is the order you will see in the Lynx Options Menu,
* which is why few Unicode-based charsets are listed here).
*
*/
/* Entity values -- for ISO Latin 1 local representation
**
** This MUST match exactly the table referred to in the DTD!
**
** One string per named entity; the trailing comment on each line gives
** the entity name the slot belongs to. Most entries are a single
** ISO-8859-1 byte written as an octal escape. The entries \001 (nbsp),
** \002 (emsp/ensp/thinsp) and \007 (shy) are marked NEVER CHANGE THIS;
** presumably they are special codes recognized elsewhere in Lynx.
*/
PRIVATE CONST char * ISO_Latin1[] = {
"\306", /* capital AE diphthong (ligature) (Æ) - AElig */
"\301", /* capital A, acute accent (Á) - Aacute */
"\302", /* capital A, circumflex accent (Â) - Acirc */
"\300", /* capital A, grave accent (À) - Agrave */
"\305", /* capital A, ring - Aring (Å) */
"\303", /* capital A, tilde - Atilde (Ã) */
"\304", /* capital A, dieresis or umlaut mark (Ä) - Auml */
"\307", /* capital C, cedilla - Ccedil (Ç) */
"\320", /* capital Eth or D with stroke (Ð) - Dstrok */
"\320", /* capital Eth, Icelandic (Ð) - ETH */
"\311", /* capital E, acute accent (É) - Eacute */
"\312", /* capital E, circumflex accent (Ê) - Ecirc */
"\310", /* capital E, grave accent (È) - Egrave */
"\313", /* capital E, dieresis or umlaut mark (Ë) - Euml */
"\315", /* capital I, acute accent (Í) - Iacute */
"\316", /* capital I, circumflex accent (Î) - Icirc */
"\314", /* capital I, grave accent (Ì) - Igrave */
"\317", /* capital I, dieresis or umlaut mark (Ï) - Iuml */
"\321", /* capital N, tilde (Ñ) - Ntilde */
"\323", /* capital O, acute accent (Ó) - Oacute */
"\324", /* capital O, circumflex accent (Ô) - Ocirc */
"\322", /* capital O, grave accent (Ò) - Ograve */
"\330", /* capital O, slash (Ø) - Oslash */
"\325", /* capital O, tilde (Õ) - Otilde */
"\326", /* capital O, dieresis or umlaut mark (Ö) - Ouml */
"\336", /* capital THORN, Icelandic (Þ) - THORN */
"\332", /* capital U, acute accent (Ú) - Uacute */
"\333", /* capital U, circumflex accent (Û) - Ucirc */
"\331", /* capital U, grave accent (Ù) - Ugrave */
"\334", /* capital U, dieresis or umlaut mark (Ü) - Uuml */
"\335", /* capital Y, acute accent (Ý) - Yacute */
"\341", /* small a, acute accent (á) - aacute */
"\342", /* small a, circumflex accent (â) - acirc */
"\264", /* spacing acute (´) - acute */
"\346", /* small ae diphthong (ligature) (æ) - aelig */
"\340", /* small a, grave accent (à) - agrave */
"\046", /* ampersand (&) - amp */
"\345", /* small a, ring (å) - aring */
"\343", /* small a, tilde (ã) - atilde */
"\344", /* small a, dieresis or umlaut mark (ä) - auml */
"\246", /* broken vertical bar (¦) - brkbar */
"\246", /* broken vertical bar (¦) - brvbar */
"\347", /* small c, cedilla (ç) - ccedil */
"\270", /* spacing cedilla (¸) - cedil */
"\242", /* cent sign (¢) - cent */
"\251", /* copyright sign (©) - copy */
"\244", /* currency sign (¤) - curren */
"\260", /* degree sign (°) - deg */
"\250", /* spacing dieresis (¨) - die */
"\367", /* division sign (÷) - divide */
"\351", /* small e, acute accent (é) - eacute */
"\352", /* small e, circumflex accent (ê) - ecirc */
"\350", /* small e, grave accent (è) - egrave */
"-", /* dash the width of emsp - emdash */
"\002", /* emsp, em space - not collapsed NEVER CHANGE THIS - emsp */
"-", /* dash the width of ensp - endash */
"\002", /* ensp, en space - not collapsed NEVER CHANGE THIS - ensp */
"\360", /* small eth, Icelandic (ð) - eth */
"\353", /* small e, dieresis or umlaut mark (ë) - euml */
"\275", /* fraction 1/2 (½) - frac12 */
"\274", /* fraction 1/4 (¼) - frac14 */
"\276", /* fraction 3/4 (¾) - frac34 */
"\076", /* greater than (>) - gt */
"\257", /* spacing macron (¯) - hibar */
"\355", /* small i, acute accent (í) - iacute */
"\356", /* small i, circumflex accent (î) - icirc */
"\241", /* inverted exclamation mark (¡) - iexcl */
"\354", /* small i, grave accent (ì) - igrave */
"\277", /* inverted question mark (¿) - iquest */
"\357", /* small i, dieresis or umlaut mark (ï) - iuml */
"\253", /* angle quotation mark, left («) - laquo */
"\074", /* less than (<) - lt */
"\257", /* spacing macron (¯) - macr */
"-", /* dash the width of emsp - mdash */
"\265", /* micro sign (µ) - micro */
"\267", /* middle dot (·) - middot */
"\001", /* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
"-", /* dash the width of ensp - ndash */
"\254", /* negation sign (¬) - not */
"\361", /* small n, tilde (ñ) - ntilde */
"\363", /* small o, acute accent (ó) - oacute */
"\364", /* small o, circumflex accent (ô) - ocirc */
"\362", /* small o, grave accent (ò) - ograve */
"\252", /* feminine ordinal indicator (ª) - ordf */
"\272", /* masculine ordinal indicator (º) - ordm */
"\370", /* small o, slash (ø) - oslash */
"\365", /* small o, tilde (õ) - otilde */
"\366", /* small o, dieresis or umlaut mark (ö) - ouml */
"\266", /* paragraph sign (¶) - para */
"\261", /* plus-or-minus sign (±) - plusmn */
"\243", /* pound sign (£) - pound */
"\042", /* quote '"' (") - quot */
"\273", /* angle quotation mark, right (») - raquo */
"\256", /* circled R registered sign (®) - reg */
"\247", /* section sign (§) - sect */
"\007", /* soft hyphen (U+00AD) NEVER CHANGE THIS - shy */
"\271", /* superscript 1 (¹) - sup1 */
"\262", /* superscript 2 (²) - sup2 */
"\263", /* superscript 3 (³) - sup3 */
"\337", /* small sharp s, German (sz ligature) (ß) - szlig */
"\002", /* thin space - not collapsed NEVER CHANGE THIS - thinsp */
"\376", /* small thorn, Icelandic (þ) - thorn */
"\327", /* multiplication sign (×) - times */
"(TM)", /* circled TM trade mark sign (™) - trade */
"\372", /* small u, acute accent (ú) - uacute */
"\373", /* small u, circumflex accent (û) - ucirc */
"\371", /* small u, grave accent (ù) - ugrave */
"\250", /* spacing dieresis (¨) - uml */
"\374", /* small u, dieresis or umlaut mark (ü) - uuml */
"\375", /* small y, acute accent (ý) - yacute */
"\245", /* yen sign (¥) - yen */
"\377", /* small y, dieresis or umlaut mark (ÿ) - yuml */
};
/* Entity values -- 7 bit character approximations
**
** This MUST match exactly the table referred to in the DTD!
**
** Same slot order as ISO_Latin1[] (see the entity name at the end of
** each comment), but every entry is an ASCII-only replacement string.
** When LY_UMLAUT is defined, the umlauted vowels are written with a
** trailing 'e' ("Ae", "oe", ...) instead of the bare vowel.
** The \001, \002 and \007 entries are the same NEVER CHANGE THIS
** codes used in ISO_Latin1[].
*/
PUBLIC CONST char * SevenBitApproximations[] = {
"AE", /* capital AE diphthong (ligature) (Æ) - AElig */
"A", /* capital A, acute accent (Á) - Aacute */
"A", /* capital A, circumflex accent (Â) - Acirc */
"A", /* capital A, grave accent (À) - Agrave */
"A", /* capital A, ring - Aring (Å) */
"A", /* capital A, tilde - Atilde (Ã) */
#ifdef LY_UMLAUT
"Ae", /* capital A, dieresis or umlaut mark (Ä) - Auml*/
#else
"A", /* capital A, dieresis or umlaut mark (Ä) - Auml*/
#endif /* LY_UMLAUT */
"C", /* capital C, cedilla (Ç) - Ccedil */
"Dj", /* capital D with stroke (Ð) - Dstrok */
"DH", /* capital Eth, Icelandic (Ð) - ETH */
"E", /* capital E, acute accent (É) - Eacute */
"E", /* capital E, circumflex accent (Ê) - Ecirc */
"E", /* capital E, grave accent (È) - Egrave */
"E", /* capital E, dieresis or umlaut mark (Ë) - Euml */
"I", /* capital I, acute accent (Í) - Iacute */
"I", /* capital I, circumflex accent (Î) - Icirc */
"I", /* capital I, grave accent (Ì) - Igrave */
"I", /* capital I, dieresis or umlaut mark (Ï) - Iuml */
"N", /* capital N, tilde - Ntilde (Ñ) */
"O", /* capital O, acute accent (Ó) - Oacute */
"O", /* capital O, circumflex accent (Ô) - Ocirc */
"O", /* capital O, grave accent (Ò) - Ograve */
"O", /* capital O, slash (Ø) - Oslash */
"O", /* capital O, tilde (Õ) - Otilde */
#ifdef LY_UMLAUT
"Oe", /* capital O, dieresis or umlaut mark (Ö) - Ouml */
#else
"O", /* capital O, dieresis or umlaut mark (Ö) - Ouml */
#endif /* LY_UMLAUT */
"P", /* capital THORN, Icelandic (Þ) - THORN */
"U", /* capital U, acute accent (Ú) - Uacute */
"U", /* capital U, circumflex accent (Û) - Ucirc */
"U", /* capital U, grave accent (Ù) - Ugrave */
#ifdef LY_UMLAUT
"Ue", /* capital U, dieresis or umlaut mark (Ü) - Uuml */
#else
"U", /* capital U, dieresis or umlaut mark (Ü) - Uuml */
#endif /* LY_UMLAUT */
"Y", /* capital Y, acute accent (Ý) - Yacute */
"a", /* small a, acute accent (á) - aacute */
"a", /* small a, circumflex accent (â) - acirc */
"'", /* spacing acute (´) - acute */
"ae", /* small ae diphthong (ligature) (æ) - aelig */
"`a", /* small a, grave accent (à) - agrave */
"&", /* ampersand (&) - amp */
"a", /* small a, ring (å) - aring */
"a", /* small a, tilde (ã) - atilde */
#ifdef LY_UMLAUT
"ae", /* small a, dieresis or umlaut mark (ä) - auml */
#else
"a", /* small a, dieresis or umlaut mark (ä) - auml */
#endif /* LY_UMLAUT */
"|", /* broken vertical bar (¦) - brkbar */
"|", /* broken vertical bar (¦) - brvbar */
"c", /* small c, cedilla (ç) - ccedil */
",", /* spacing cedilla (¸) - cedil */
"-c-", /* cent sign (¢) - cent */
"(c)", /* copyright sign (©) - copy */
"CUR", /* currency sign (¤) - curren */
"DEG", /* degree sign (°) - deg */
"\042", /* spacing dieresis (¨) - die */
"/", /* division sign (÷) - divide */
"e", /* small e, acute accent (é) - eacute */
"e", /* small e, circumflex accent (ê) - ecirc */
"e", /* small e, grave accent (è) - egrave */
"-", /* dash the width of emsp - emdash */
"\002", /* emsp NEVER CHANGE THIS - emsp */
"-", /* dash the width of ensp - endash */
"\002", /* ensp NEVER CHANGE THIS - ensp */
"dh", /* small eth, Icelandic eth (ð) */
"e", /* small e, dieresis or umlaut mark (ë) - euml */
" 1/2", /* fraction 1/2 (½) - frac12 */
" 1/4", /* fraction 1/4 (¼) - frac14 */
" 3/4", /* fraction 3/4 (¾) - frac34 */
">", /* greater than (>) - gt */
"-", /* spacing macron (¯) - hibar */
"i", /* small i, acute accent (í) - iacute */
"i", /* small i, circumflex accent (î) - icirc*/
"!", /* inverted exclamation mark (¡) - iexcl */
"`i", /* small i, grave accent (ì) - igrave */
"?", /* inverted question mark (¿) - iquest */
"i", /* small i, dieresis or umlaut mark (ï) - iuml */
"<<", /* angle quotation mark, left («) - laquo */
"<", /* less than - lt (<) */
"-", /* spacing macron (¯) - macr */
"-", /* dash the width of emsp - mdash */
"u", /* micro sign (µ) - micro */
".", /* middle dot (·) - middot */
"\001", /* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
"-", /* dash the width of ensp - ndash */
"NOT", /* negation sign (¬) - not */
"n", /* small n, tilde (ñ) - ntilde */
"o", /* small o, acute accent (ó) - oacute */
"o", /* small o, circumflex accent (ô) - ocirc */
"o", /* small o, grave accent (ò) - ograve */
"-a", /* feminine ordinal indicator (ª) - ordf */
"-o", /* masculine ordinal indicator (º) - ordm */
"o", /* small o, slash (ø) - oslash */
"o", /* small o, tilde (õ) - otilde */
#ifdef LY_UMLAUT
"oe", /* small o, dieresis or umlaut mark (ö) - ouml */
#else
"o", /* small o, dieresis or umlaut mark (ö) - ouml */
#endif /* LY_UMLAUT */
"P:", /* paragraph sign (¶) - para */
"+-", /* plus-or-minus sign (±) - plusmn */
"-L-", /* pound sign (£) - pound */
"\"", /* quote '"' (") - quot */
">>", /* angle quotation mark, right (») - raquo */
"(R)", /* circled R registered sign (®) - reg */
"S:", /* section sign (§) - sect */
"\007", /* soft hyphen (U+00AD) NEVER CHANGE THIS - shy */
"^1", /* superscript 1 (¹) - sup1 */
"^2", /* superscript 2 (²) - sup2 */
"^3", /* superscript 3 (³) - sup3 */
"ss", /* small sharp s, German (sz ligature) (ß) - szlig */
"\002", /* thin space - not collapsed NEVER CHANGE THIS - thinsp */
"p", /* small thorn, Icelandic (þ) - thorn */
"*", /* multiplication sign (×) - times */
"(TM)", /* circled TM trade mark sign (™) - trade */
"u", /* small u, acute accent (ú) - uacute */
"u", /* small u, circumflex accent (û) - ucirc */
"u", /* small u, grave accent (ù) - ugrave */
"\042", /* spacing dieresis (¨) - uml */
#ifdef LY_UMLAUT
"ue", /* small u, dieresis or umlaut mark (ü) - uuml */
#else
"u", /* small u, dieresis or umlaut mark (ü) - uuml */
#endif /* LY_UMLAUT */
"y", /* small y, acute accent (ý) - yacute */
"YEN", /* yen sign (¥) - yen */
"y", /* small y, dieresis or umlaut mark (ÿ) - yuml */
};
/*
* Add your new character sets HERE (but only if you
* can't construct Unicode tables for them). - FM
*/
/*
* Add the array name to LYCharSets
*
* One entity-value table per character set, indexed by charset handle.
* HTMLUseCharacterSet() points p_entity_values at LYCharSets[i].
* Only ISO Latin 1 has its own table here; every other set listed
* below uses SevenBitApproximations.
* The order MUST be the same as in LYchar_set_names, LYCharSet_UC
* and LYlowest_eightbit.
*/
PUBLIC CONST char ** LYCharSets[MAXCHARSETS]={
ISO_Latin1, /* ISO Latin 1 */
SevenBitApproximations, /* DosLatin1 (cp850) */
SevenBitApproximations, /* WinLatin1 (cp1252) */
SevenBitApproximations, /* DosLatinUS (cp437) */
SevenBitApproximations, /* DEC Multinational */
SevenBitApproximations, /* Macintosh (8 bit) */
SevenBitApproximations, /* NeXT character set */
SevenBitApproximations, /* Chinese */
SevenBitApproximations, /* Japanese (EUC-JP) */
SevenBitApproximations, /* Japanese (Shift_JIS) */
SevenBitApproximations, /* Korean */
SevenBitApproximations, /* Taipei (Big5) */
SevenBitApproximations, /* Vietnamese (VISCII) */
SevenBitApproximations, /* 7 Bit Approximations */
SevenBitApproximations, /* Transparent */
};
/*
* Add the name that the user will see below.
* The order of LYCharSets and LYchar_set_names MUST be the same
*
* The list is terminated by a null pointer (hence MAXCHARSETS + 1
* slots). UCGetLYhndl_byAnyName() compares its argument against
* these strings with strcmp(), so the match is exact and
* case-sensitive.
*/
PUBLIC CONST char * LYchar_set_names[MAXCHARSETS + 1]={
"Western (ISO-8859-1)",
"Western (cp850)",
"Western (windows-1252)",
"IBM PC US codepage (cp437)",
"DEC Multinational",
"Macintosh (8 bit)",
"NeXT character set",
"Chinese",
"Japanese (EUC-JP)",
"Japanese (Shift_JIS)",
"Korean",
"Taipei (Big5)",
"Vietnamese (VISCII)",
"7 bit approximations (US-ASCII)",
"Transparent",
(char *) 0 /* end-of-list marker */
};
/*
* Associate additional pieces of info with each of the charsets listed
* above.
* Will be automatically modified (and extended) by charset translations
* which are loaded using the chartrans mechanism.
* Most important piece of info to put here is a MIME charset name.
* Used for chartrans (see UCDefs.h).
* The order of LYCharSets and LYCharSet_UC MUST be the same.
*
* Note that most of the charsets added by the new mechanism in src/chrtrans
* don't show up here at all. They don't have to.
*
* Within this file the functions below read the members MIMEname, enc,
* codepoints and like8859 of these entries; see UCDefs.h for the full
* layout of LYUCcharset.
*/
PUBLIC LYUCcharset LYCharSet_UC[MAXCHARSETS]=
{
/*
* Zero position placeholder and HTMLGetEntityUCValue() reference. - FM
*/
{-1,"iso-8859-1", UCT_ENC_8BIT,0,
UCT_REP_IS_LAT1,
UCT_CP_IS_LAT1, UCT_R_LAT1,UCT_R_LAT1},
/*
* Placeholders for Unicode tables. - FM
*/
{-1,"cp850", UCT_ENC_8BIT,0,
UCT_REP_SUPERSETOF_LAT1,
0, UCT_R_8BIT,UCT_R_ASCII},
{-1,"windows-1252", UCT_ENC_8BIT,0,
UCT_REP_SUPERSETOF_LAT1,
0, UCT_R_8BIT,UCT_R_ASCII},
{-1,"cp437", UCT_ENC_8BIT,0,0,0, UCT_R_8BIT,UCT_R_ASCII},
{-1,"dec-mcs", UCT_ENC_8BIT,0,0,0, UCT_R_8BIT,UCT_R_ASCII},
{-1,"macintosh", UCT_ENC_8BIT,0,0,0, UCT_R_8BIT,UCT_R_ASCII},
{-1,"next", UCT_ENC_8BIT,0,0,0, UCT_R_8BIT,UCT_R_ASCII},
/*
* There is no strict correlation for the next five, since the transfer
* charset gets decoded into Display Char Set by the CJK code (separate
* from Unicode mechanism). For now we use the MIME name that describes
* what is output to the terminal. - KW
*
* HTMLSetCharacterHandling() selects HTCJK and kanji_code by comparing
* these five MIME names literally, so keep the spellings in sync.
*/
{-1,"euc-cn", UCT_ENC_CJK,0,0,0, UCT_R_8BIT,UCT_R_ASCII},
{-1,"euc-jp", UCT_ENC_CJK,0,0,0, UCT_R_8BIT,UCT_R_ASCII},
{-1,"shift_jis", UCT_ENC_CJK,0,0,0, UCT_R_8BIT,UCT_R_ASCII},
{-1,"euc-kr", UCT_ENC_CJK,0,0,0, UCT_R_8BIT,UCT_R_ASCII},
{-1,"big5", UCT_ENC_CJK,0,0,0, UCT_R_8BIT,UCT_R_ASCII},
/*
* Placeholders for Unicode tables. - FM
*/
{-1,"viscii", UCT_ENC_8BIT_C0,0,0,0, UCT_R_8BIT,UCT_R_ASCII},
{-1,"us-ascii", UCT_ENC_7BIT,0,
UCT_REP_SUBSETOF_LAT1,
UCT_CP_SUBSETOF_LAT1, UCT_R_ASCII,UCT_R_ASCII},
/*
* Placeholder for non-translation mode. - FM
*/
{-1,"x-transparent", UCT_ENC_8BIT,0,0,0, UCT_R_8BIT,UCT_R_ASCII}
};
/*
* Add the code of the lowest character with the high bit set
* that can be directly displayed.
* The order of LYCharSets and LYlowest_eightbit MUST be the same.
*
* (If a charset has a chartrans Unicode table,
* LYlowest_eightbit will be verified/modified anyway.)
*
* HTMLSetCharacterHandling() compares these values against the
* thresholds 160, 130, 191 and 155 when it sets the pass-through
* flags, the slang eight-bit limit and the ena_csi() argument.
*/
PUBLIC int LYlowest_eightbit[MAXCHARSETS]={
160, /* ISO Latin 1 */
128, /* DosLatin1 (cp850) */
130, /* WinLatin1 (cp1252) */
128, /* DosLatinUS (cp437) */
160, /* DEC Multinational */
128, /* Macintosh (8 bit) */
128, /* NeXT character set */
128, /* Chinese */
128, /* Japanese (EUC) */
128, /* Japanese (SJIS) */
128, /* Korean */
128, /* Taipei (Big5) */
128, /* Vietnamese (VISCII) */
999, /* 7 bit approximations */
128 /* Transparent (???) */
};
/*
 * Configure character handling for display character set i,
 * based on the current LYUseDefaultRawMode value. - FM
 *
 * Reads LYUseDefaultRawMode and the handle of UCAssume_MIMEcharset and
 * from them sets LYRawMode, HTCJK, kanji_code (non-CJK sets, or CJK sets
 * whose MIME name is recognized), the four HTPass* flags and
 * UCLYhndl_for_unspec.  With USE_SLANG it also sets
 * SLsmg_Display_Eight_Bit.  Finally it calls ena_csi().
 */
PUBLIC void HTMLSetCharacterHandling ARGS1(int,i)
{
    int assumed = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset);
    BOOLEAN is_cjk = (LYCharSet_UC[i].enc == UCT_ENC_CJK);

    if (is_cjk) {
        CONST char *mime = LYCharSet_UC[i].MIMEname;

        /*
         * Pick language and kanji code from the MIME name.  An
         * unrecognized name leaves both globals as they were.
         */
        if (strcmp(mime, "euc-cn") == 0) {
            HTCJK = CHINESE;
            kanji_code = EUC;
        } else if (strcmp(mime, "euc-jp") == 0) {
            HTCJK = JAPANESE;
            kanji_code = EUC;
        } else if (strcmp(mime, "shift_jis") == 0) {
            HTCJK = JAPANESE;
            kanji_code = SJIS;
        } else if (strcmp(mime, "euc-kr") == 0) {
            HTCJK = KOREAN;
            kanji_code = EUC;
        } else if (strcmp(mime, "big5") == 0) {
            HTCJK = TAIPEI;
            kanji_code = EUC;
        }

        /* For any CJK set: CJK mode is switched off unless raw is the default. */
        if (!LYUseDefaultRawMode)
            HTCJK = NOCJK;

        LYRawMode = (HTCJK != NOCJK) ? TRUE : FALSE;
        HTPassEightBitRaw = FALSE;
        HTPassEightBitNum = FALSE;
        HTPassHighCtrlRaw = LYRawMode;  /* TRUE exactly when HTCJK != NOCJK */
        HTPassHighCtrlNum = FALSE;
    } else {
        HTCJK = NOCJK;
        kanji_code = NOKANJI;

        /* Raw mode follows the default only for the assumed charset. */
        LYRawMode = (i == assumed) ? LYUseDefaultRawMode
                                   : (!LYUseDefaultRawMode);

        HTPassEightBitNum =
            ((LYCharSet_UC[i].codepoints & UCT_CP_SUPERSETOF_LAT1) ||
             (LYCharSet_UC[i].like8859 & UCT_R_HIGH8BIT));
        HTPassEightBitRaw = (LYRawMode && LYlowest_eightbit[i] <= 160);
        HTPassHighCtrlRaw = ((LYRawMode || i == assumed) &&
                             LYlowest_eightbit[i] <= 130);
        HTPassHighCtrlNum = FALSE;
    }

    /*
     * UCLYhndl_for_unspec is the "current" state and depends on
     * LYRawMode, whereas UCAssume_MIMEcharset is independent of
     * LYRawMode: it holds the history and may be changed from the
     * 'O'ptions menu only. - LP
     * UCAssume_MIMEcharset itself is never modified here.
     */
    if (LYRawMode) {
        UCLYhndl_for_unspec = i;
    } else if (assumed != i &&
               (!is_cjk || LYCharSet_UC[assumed].enc != UCT_ENC_CJK)) {
        UCLYhndl_for_unspec = assumed;  /* fall to UCAssume_MIMEcharset */
    } else {
        UCLYhndl_for_unspec = LATIN1;
    }

#ifdef USE_SLANG
    /*
     * Cap at 191: higher than this may output cntrl chars to screen. - KW
     */
    SLsmg_Display_Eight_Bit =
        (LYlowest_eightbit[i] > 191) ? 191 : LYlowest_eightbit[i];
#endif /* USE_SLANG */

    /*
     * NOTE(review): this is indexed by the global current_char_set, not
     * by i -- confirm that callers always pass current_char_set.
     */
    ena_csi((LYlowest_eightbit[current_char_set] > 155));
}
/*
 * Set HTCJK from the "in" and "out" charset MIME names.
 *
 * Outside raw mode HTCJK is always NOCJK.  In raw mode a CJK language
 * is chosen only when both names belong to the same language: either
 * of "euc-jp"/"shift_jis" on each side for Japanese, or the identical
 * name on both sides for "euc-cn", "big5" and "euc-kr".  Comparison is
 * by strcmp(), i.e. case-sensitive.
 */
PUBLIC void Set_HTCJK ARGS2(
	CONST char *,	inMIMEname,
	CONST char *,	outMIMEname)
{
    HTCJKlang lang = NOCJK;

    if (LYRawMode) {
        if ((strcmp(inMIMEname, "euc-jp") == 0 ||
             strcmp(inMIMEname, "shift_jis") == 0) &&
            (strcmp(outMIMEname, "euc-jp") == 0 ||
             strcmp(outMIMEname, "shift_jis") == 0)) {
            lang = JAPANESE;
        } else if (strcmp(inMIMEname, "euc-cn") == 0 &&
                   strcmp(outMIMEname, "euc-cn") == 0) {
            lang = CHINESE;
        } else if (strcmp(inMIMEname, "big5") == 0 &&
                   strcmp(outMIMEname, "big5") == 0) {
            lang = TAIPEI;
        } else if (strcmp(inMIMEname, "euc-kr") == 0 &&
                   strcmp(outMIMEname, "euc-kr") == 0) {
            lang = KOREAN;
        }
    }

    HTCJK = lang;
}
/*
 * Set LYDefaultRawMode for character set i: true exactly when the
 * set's encoding is UCT_ENC_CJK. - FM
 *
 * Historical note from the original author: "Currently unused: the
 * default value so obvious that LYUseDefaultRawMode utilized directly
 * by someone's mistake." - LP
 */
PRIVATE void HTMLSetRawModeDefault ARGS1(int,i)
{
    if (LYCharSet_UC[i].enc == UCT_ENC_CJK) {
        LYDefaultRawMode = TRUE;
    } else {
        LYDefaultRawMode = FALSE;
    }
}
/*
 * Derive LYUseDefaultRawMode from character set i and the raw-mode
 * flag passed in as modeflag. - FM
 *
 * For a CJK set, or for the set that matches UCAssume_MIMEcharset,
 * the flag is stored as given.  For any other non-CJK set it is
 * stored inverted.
 */
PUBLIC void HTMLSetUseDefaultRawMode ARGS2(
	int,		i,
	BOOLEAN,	modeflag)
{
    if (LYCharSet_UC[i].enc == UCT_ENC_CJK) {
        /* CJK encoding: taken as is */
        LYUseDefaultRawMode = modeflag;
    } else if (i == safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset)) {
        LYUseDefaultRawMode = modeflag;
    } else {
        LYUseDefaultRawMode = (!modeflag);
    }
}
/*
 * Record in LYHaveCJKCharacterSet whether character set i
 * uses the UCT_ENC_CJK encoding. - FM
 */
PRIVATE void HTMLSetHaveCJKCharacterSet ARGS1(int,i)
{
    LYHaveCJKCharacterSet =
        (LYCharSet_UC[i].enc == UCT_ENC_CJK) ? TRUE : FALSE;
}
/*
 * Function to set the DisplayCharsetMatchLocale value
 * based on the selected character set.
 * It is used in UPPER8 for 8bit case-insensitive search
 * by matching def7_uni.tbl images. - LP
 *
 * Result:
 *   - CJK display charset (LYHaveCJKCharacterSet): TRUE.
 *   - MIME name starting with "cp" or "windows" (any case): FALSE.
 *   - anything else: TRUE only when built with LOCALE and
 *     UCForce8bitTOUPPER is not set, otherwise FALSE.
 *
 * Must be called after LYHaveCJKCharacterSet has been updated for i.
 */
PRIVATE void HTMLSetDisplayCharsetMatchLocale ARGS1(int,i)
{
    BOOLEAN match;

    if (LYHaveCJKCharacterSet) {
        /*
        ** We have no intention to pass CJK via UCTransChar if that happened.
        ** Let someone from CJK correct this if necessary.
        */
        DisplayCharsetMatchLocale = TRUE; /* old-style */
        return;
    } else if (!strncasecomp(LYCharSet_UC[i].MIMEname, "cp", 2) ||
               !strncasecomp(LYCharSet_UC[i].MIMEname, "windows", 7)) {
        /*
        ** strncasecomp() returns 0 on a match, so the results must be
        ** negated: without the '!' this test was true for every name
        ** and the locale branch below could never be reached.
        **
        ** Assume dos/windows displays usually on remote terminal, hence it
        ** rarely matches locale. (In fact, MS Windows codepoints locale are
        ** never seen on UNIX).
        */
        match = FALSE;
    } else {
        match = TRUE; /* guess, but see below */
#if !defined(LOCALE)
        match = FALSE;
#else
        if (UCForce8bitTOUPPER) {
            /*
            ** Force disable locale (from lynx.cfg)
            */
            match = FALSE;
        }
#endif
    }
    DisplayCharsetMatchLocale = match;
    return;
}
/*
* lynx 2.8/2.7.2 (and earlier) compatibility code:
* the "human-readable" charset names change over time,
* so we map those historical names to MIME names here
* so that an old lynx.cfg and (especially) .lynxrc are always recognized.
* Please update this table when you change the "fullname" of any present charset.
*
* UCGetLYhndl_byAnyName() scans this table in order, comparing fullname
* with strcmp(), and passes the MIMEname of the first hit to
* UCGetLYhndl_byMIME(). Several old names may map to one MIME name.
*/
typedef struct _names_pairs {
CONST char * fullname; /* historical display name */
CONST char * MIMEname; /* MIME charset name it maps to */
} names_pairs;
PRIVATE CONST names_pairs OLD_charset_names[] = {
{"ISO Latin 1", "iso-8859-1"},
{"ISO Latin 2", "iso-8859-2"},
{"WinLatin1 (cp1252)", "windows-1252"},
{"DEC Multinational", "dec-mcs"},
{"Macintosh (8 bit)", "macintosh"},
{"NeXT character set", "next"},
{"KOI8-R Cyrillic", "koi8-r"},
{"Chinese", "euc-cn"},
{"Japanese (EUC)", "euc-jp"},
{"Japanese (SJIS)", "shift_jis"},
{"Korean", "euc-kr"},
{"Taipei (Big5)", "big5"},
{"Vietnamese (VISCII)", "viscii"},
{"7 bit approximations","us-ascii"},
{"Transparent", "x-transparent"},
{"DosLatinUS (cp437)", "cp437"},
{"IBM PC character set","cp437"},
{"DosLatin1 (cp850)", "cp850"},
{"IBM PC codepage 850", "cp850"},
{"DosLatin2 (cp852)", "cp852"},
{"PC Latin2 CP 852", "cp852"},
{"DosCyrillic (cp866)", "cp866"},
{"DosArabic (cp864)", "cp864"},
{"DosGreek (cp737)", "cp737"},
{"DosBaltRim (cp775)", "cp775"},
{"DosGreek2 (cp869)", "cp869"},
{"DosHebrew (cp862)", "cp862"},
{"WinLatin2 (cp1250)", "windows-1250"},
{"WinCyrillic (cp1251)","windows-1251"},
{"WinGreek (cp1253)", "windows-1253"},
{"WinHebrew (cp1255)", "windows-1255"},
{"WinArabic (cp1256)", "windows-1256"},
{"WinBaltRim (cp1257)", "windows-1257"},
{"ISO Latin 3", "iso-8859-3"},
{"ISO Latin 4", "iso-8859-4"},
{"ISO 8859-5 Cyrillic", "iso-8859-5"},
{"ISO 8859-6 Arabic", "iso-8859-6"},
{"ISO 8859-7 Greek", "iso-8859-7"},
{"ISO 8859-8 Hebrew", "iso-8859-8"},
{"ISO 8859-9 (Latin 5)","iso-8859-9"},
{"ISO 8859-10", "iso-8859-10"},
{"UNICODE UTF 8", "utf-8"},
{"RFC 1345 w/o Intro", "mnemonic+ascii+0"},
{"RFC 1345 Mnemonic", "mnemonic"},
{NULL, NULL}, /* terminated with NULL */
};
/*
 * lynx 2.8/2.7.2 compatibility code:
 * read "character_set" parameter from lynx.cfg and .lynxrc
 * in both MIME name and "human-readable" name (old and new style).
 *
 * Lookup order:
 *   1. current display names in LYchar_set_names (returns the index),
 *   2. historical names in OLD_charset_names (mapped to a MIME name),
 *   3. value itself taken as a MIME name.
 * Steps 2 and 3 return whatever UCGetLYhndl_byMIME() returns.
 *
 * value is handed to LYTrimTrailing() before the comparisons, so it
 * must be writable (presumably it is trimmed in place).
 *
 * Returns -1 if value is NULL or not recognized.
 */
PUBLIC int UCGetLYhndl_byAnyName ARGS1 (char *, value)
{
    int i;

    /* Reject NULL before anything touches the string. */
    if (value == NULL)
        return -1;

    LYTrimTrailing(value);

    /* search by name */
    for (i = 0; (i < MAXCHARSETS && LYchar_set_names[i]); i++) {
        if (!strcmp(value, LYchar_set_names[i])) {
            return i; /* OK */
        }
    }

    /* search by old name from 2.8/2.7.2 version */
    for (i = 0; (OLD_charset_names[i].fullname); i++) {
        if (!strcmp(value, OLD_charset_names[i].fullname)) {
            return UCGetLYhndl_byMIME(OLD_charset_names[i].MIMEname); /* OK */
        }
    }

    return UCGetLYhndl_byMIME(value); /* by MIME */
}
/*
* Entity names -- Ordered by ISO Latin 1 value.
* ---------------------------------------------
* For conversions of DECIMAL escaped entities.
* Must be in order of ascending value.
*
* The table covers the values 160 through 255 with no gaps, so the
* entry for value V sits at index V - 160. HTMLGetEntityName() indexes
* this array directly with its argument.
*/
PRIVATE CONST char * LYEntityNames[] = {
/* NAME DECIMAL VALUE */
"nbsp", /* 160, non breaking space */
"iexcl", /* 161, inverted exclamation mark */
"cent", /* 162, cent sign */
"pound", /* 163, pound sign */
"curren", /* 164, currency sign */
"yen", /* 165, yen sign */
"brvbar", /* 166, broken vertical bar, (brkbar) */
"sect", /* 167, section sign */
"uml", /* 168, spacing dieresis */
"copy", /* 169, copyright sign */
"ordf", /* 170, feminine ordinal indicator */
"laquo", /* 171, angle quotation mark, left */
"not", /* 172, negation sign */
"shy", /* 173, soft hyphen */
"reg", /* 174, circled R registered sign */
"hibar", /* 175, spacing macron */
"deg", /* 176, degree sign */
"plusmn", /* 177, plus-or-minus sign */
"sup2", /* 178, superscript 2 */
"sup3", /* 179, superscript 3 */
"acute", /* 180, spacing acute (96) */
"micro", /* 181, micro sign */
"para", /* 182, paragraph sign */
"middot", /* 183, middle dot */
"cedil", /* 184, spacing cedilla */
"sup1", /* 185, superscript 1 */
"ordm", /* 186, masculine ordinal indicator */
"raquo", /* 187, angle quotation mark, right */
"frac14", /* 188, fraction 1/4 */
"frac12", /* 189, fraction 1/2 */
"frac34", /* 190, fraction 3/4 */
"iquest", /* 191, inverted question mark */
"Agrave", /* 192, capital A, grave accent */
"Aacute", /* 193, capital A, acute accent */
"Acirc", /* 194, capital A, circumflex accent */
"Atilde", /* 195, capital A, tilde */
"Auml", /* 196, capital A, dieresis or umlaut mark */
"Aring", /* 197, capital A, ring */
"AElig", /* 198, capital AE diphthong (ligature) */
"Ccedil", /* 199, capital C, cedilla */
"Egrave", /* 200, capital E, grave accent */
"Eacute", /* 201, capital E, acute accent */
"Ecirc", /* 202, capital E, circumflex accent */
"Euml", /* 203, capital E, dieresis or umlaut mark */
"Igrave", /* 204, capital I, grave accent */
"Iacute", /* 205, capital I, acute accent */
"Icirc", /* 206, capital I, circumflex accent */
"Iuml", /* 207, capital I, dieresis or umlaut mark */
"ETH", /* 208, capital Eth, Icelandic (or Latin2 Dstrok) */
"Ntilde", /* 209, capital N, tilde */
"Ograve", /* 210, capital O, grave accent */
"Oacute", /* 211, capital O, acute accent */
"Ocirc", /* 212, capital O, circumflex accent */
"Otilde", /* 213, capital O, tilde */
"Ouml", /* 214, capital O, dieresis or umlaut mark */
"times", /* 215, multiplication sign */
"Oslash", /* 216, capital O, slash */
"Ugrave", /* 217, capital U, grave accent */
"Uacute", /* 218, capital U, acute accent */
"Ucirc", /* 219, capital U, circumflex accent */
"Uuml", /* 220, capital U, dieresis or umlaut mark */
"Yacute", /* 221, capital Y, acute accent */
"THORN", /* 222, capital THORN, Icelandic */
"szlig", /* 223, small sharp s, German (sz ligature) */
"agrave", /* 224, small a, grave accent */
"aacute", /* 225, small a, acute accent */
"acirc", /* 226, small a, circumflex accent */
"atilde", /* 227, small a, tilde */
"auml", /* 228, small a, dieresis or umlaut mark */
"aring", /* 229, small a, ring */
"aelig", /* 230, small ae diphthong (ligature) */
"ccedil", /* 231, small c, cedilla */
"egrave", /* 232, small e, grave accent */
"eacute", /* 233, small e, acute accent */
"ecirc", /* 234, small e, circumflex accent */
"euml", /* 235, small e, dieresis or umlaut mark */
"igrave", /* 236, small i, grave accent */
"iacute", /* 237, small i, acute accent */
"icirc", /* 238, small i, circumflex accent */
"iuml", /* 239, small i, dieresis or umlaut mark */
"eth", /* 240, small eth, Icelandic */
"ntilde", /* 241, small n, tilde */
"ograve", /* 242, small o, grave accent */
"oacute", /* 243, small o, acute accent */
"ocirc", /* 244, small o, circumflex accent */
"otilde", /* 245, small o, tilde */
"ouml", /* 246, small o, dieresis or umlaut mark */
"divide", /* 247, division sign */
"oslash", /* 248, small o, slash */
"ugrave", /* 249, small u, grave accent */
"uacute", /* 250, small u, acute accent */
"ucirc", /* 251, small u, circumflex accent */
"uuml", /* 252, small u, dieresis or umlaut mark */
"yacute", /* 253, small y, acute accent */
"thorn", /* 254, small thorn, Icelandic */
"yuml", /* 255, small y, dieresis or umlaut mark */
};
/*
 * Function to return the entity names of
 * ISO-8859-1 8-bit characters. - FM
 *
 * code is a zero-based index into LYEntityNames[], NOT the character
 * value itself: index 0 is the entry for value 160 ("nbsp"), so a
 * caller holding an ISO-8859-1 value must subtract 160 first.
 *
 * Returns the entity name, or the empty string "" when code is
 * negative or past the end of the table.  Never returns NULL.
 */
PUBLIC CONST char * HTMLGetEntityName ARGS1(
	UCode_t,	code)
{
    /* Element count taken from the array itself, so it tracks the table. */
    int MaxValue = (int)(sizeof(LYEntityNames) / sizeof(LYEntityNames[0])) - 1;

    if (code < 0 || code > MaxValue) {
        return "";
    }

    return LYEntityNames[code];
}
/*
 * Look up an entity name in HTML_dtd.unicode_entity_info and return
 * its UCode_t (long int) code.  Returns 0 if name is NULL, empty or
 * not found. - FM
 *
 * The comparison is strcmp(), i.e. case sensitive.  The normal build
 * does a binary search and therefore relies on the table being sorted
 * in strcmp() order; with NOT_ASCII defined a linear scan is used
 * instead.
 *
 * unicode_entities[] handles all the names from old style entities[] too.
 * Lynx now calls unicode_entities[] only through this function.
 * Note, we need not check for special characters here or before this
 * function; they should be checked *after* invoking it, see
 * put_special_unicodes() in SGML.c.
 *
 * In the future we will try to isolate all calls to entities[]
 * in favor of new unicode-based chartrans scheme. - LP
 */
PUBLIC UCode_t HTMLGetEntityUCValue ARGS1(
	CONST char *,	name)
{
    CONST UC_entity_info *table = HTML_dtd.unicode_entity_info;
    size_t pos;
#ifndef NOT_ASCII
    size_t first, limit;
#endif

    /*
     * Nothing to look up without a non-empty name. - FM
     */
    if (name == NULL || *name == '\0')
        return 0;

#ifdef NOT_ASCII /* S/390 -- gil -- 1656 */
    /*
    ** Linear search for NOT_ASCII.
    */
    for (pos = 0; pos < HTML_dtd.number_of_unicode_entities; pos++) {
        if (strcmp(table[pos].name, name) == 0) /* Case sensitive! */
            return table[pos].code;
    }
#else /* NOT_ASCII */
    /*
    ** Binary search over the half-open range [first, limit).
    */
    first = 0;
    limit = HTML_dtd.number_of_unicode_entities;
    while (limit > first) {
        int order;

        pos = first + (limit - first) / 2;
        order = strcmp(table[pos].name, name); /* Case sensitive! */
        if (order == 0)
            return table[pos].code;
        if (order < 0)
            first = pos + 1;    /* table entry sorts before name */
        else
            limit = pos;        /* table entry sorts after name */
    }
#endif /* NOT_ASCII S/390 -- gil -- 1662 */

    return 0;
}
/*
 * Select character set i: point p_entity_values at its entity table,
 * then update the raw-mode default, the character handling state,
 * the LYHaveCJKCharacterSet flag and DisplayCharsetMatchLocale. - FM
 *
 * The CJK flag is refreshed before the locale-match step because
 * HTMLSetDisplayCharsetMatchLocale() reads LYHaveCJKCharacterSet.
 */
PUBLIC void HTMLUseCharacterSet ARGS1(int, i)
{
    p_entity_values = LYCharSets[i];
    HTMLSetRawModeDefault(i);

    /* sets LYRawMode and the CJK attributes */
    HTMLSetCharacterHandling(i);

    HTMLSetHaveCJKCharacterSet(i);
    HTMLSetDisplayCharsetMatchLocale(i);
}
/*
 * Initializer: runs UCInit(), the initialization function for the
 * CHARTRANS handling, and reports the value of UCInitialized
 * afterwards. - KW
 */
PUBLIC int LYCharSetsDeclared NOPARAMS
{
    int status;

    UCInit();
    status = UCInitialized;
    return status;
}