/*
* $LynxId: LYCharSets.c,v 1.68 2013/01/04 21:47:16 tom Exp $
*/
#include <HTUtils.h>
#include <HTCJK.h>
#include <HTMLDTD.h>
#include <LYGlobalDefs.h>
#include <UCMap.h>
#include <UCdomap.h>
#include <UCDefs.h>
#include <LYCharSets.h>
#include <GridText.h>
#include <LYCurses.h>
#include <LYStrings.h>
#include <LYLeaks.h>
/* Kanji encoding in effect; assigned by HTMLSetCharacterHandling(). */
HTkcode kanji_code = NOKANJI;
/* TRUE when the selected charset is CJK; assigned by HTMLSetHaveCJKCharacterSet(). */
BOOLEAN LYHaveCJKCharacterSet = FALSE;
/* Assigned by HTMLSetDisplayCharsetMatchLocale(). */
BOOLEAN DisplayCharsetMatchLocale = TRUE;
/* The next two are not referenced in this file - presumably used by reload code elsewhere; confirm. */
BOOL force_old_UCLYhndl_on_reload = FALSE;
int forced_UCLYhdnl;
int LYNumCharsets = 0; /* Will be initialized later by UC_Register. */
int current_char_set = -1; /* will be initialized later in LYMain.c */
int linedrawing_char_set = -1;
STRING2PTR p_entity_values = NULL; /* Pointer, for HTML_put_entity() */
/* obsolete and probably not used(???) */
/* will be initialized in HTMLUseCharacterSet */
#ifdef USE_CHARSET_CHOICE
/* Per-charset hide/show flags; init_charset_subsets() un-hides the current ones. */
charset_subset_t charset_subsets[MAXCHARSETS];
BOOL custom_display_charset = FALSE;
BOOL custom_assumed_doc_charset = FALSE;
#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
/*
 * Support for the old options menu screen, filled by init_charset_subsets():
 * the *_map arrays translate a menu position to a charset index, and the
 * *_choices arrays hold the NULL-terminated lists of displayed strings.
 */
int display_charset_map[MAXCHARSETS];
int assumed_doc_charset_map[MAXCHARSETS];
const char *display_charset_choices[MAXCHARSETS + 1];
const char *assumed_charset_choices[MAXCHARSETS + 1];
/* Menu position of current_char_set within display_charset_choices[]. */
int displayed_display_charset_idx;
#endif
#endif /* USE_CHARSET_CHOICE */
/*
* New character sets now declared with UCInit() in UCdomap.c
*
* INSTRUCTIONS for adding new character sets which do not have
* Unicode tables now in UCdomap.h
*
*
* [With luck you will no longer need to correct or extend the old-style
* mappings below (ISO_Latin1[] and SevenBitApproximations[]): translation now
* goes through the new chartrans mechanism, and these tables are kept for
* compatibility only. They should be cleaned up, but that is not easy...]
*
* Currently we only declare some charset's properties here (such as MIME
* names, etc.), it does not include real mapping.
*
* There is a place marked "Add your new character sets HERE" in this file.
* Make up a character set and add it in the same style as the ISO_LATIN1 set
* below, giving it a unique name.
*
* Add the name of the set to LYCharSets. Similarly add the appropriate
* information to the tables below: LYchar_set_names, LYCharSet_UC,
* LYlowest_eightbit. These 4 tables all MUST have the same order. (And this
* is the order you will see in Lynx Options Menu, which is why few
* unicode-based charsets are listed here).
*
*/
/* Entity values -- for ISO Latin 1 local representation
*
* One string per named entity, in the order given by the trailing
* "- name" in each comment. Installed as LYCharSets[0].
*
* This MUST match exactly the table referred to in the DTD!
*
* Entries marked NEVER CHANGE THIS are control codes ("\001", "\002",
* "\007") rather than printable bytes; presumably they are special
* markers interpreted elsewhere in Lynx - confirm before touching them.
*/
static const char *ISO_Latin1[] =
{
"\306", /* capital AE diphthong (ligature) (Æ) - AElig */
"\301", /* capital A, acute accent (Á) - Aacute */
"\302", /* capital A, circumflex accent (Â) - Acirc */
"\300", /* capital A, grave accent (À) - Agrave */
"\305", /* capital A, ring - Aring (Å) */
"\303", /* capital A, tilde - Atilde (Ã) */
"\304", /* capital A, dieresis or umlaut mark (Ä) - Auml */
"\307", /* capital C, cedilla - Ccedil (Ç) */
"\320", /* capital Eth or D with stroke (Ð) - Dstrok */
"\320", /* capital Eth, Icelandic (Ð) - ETH */
"\311", /* capital E, acute accent (É) - Eacute */
"\312", /* capital E, circumflex accent (Ê) - Ecirc */
"\310", /* capital E, grave accent (È) - Egrave */
"\313", /* capital E, dieresis or umlaut mark (Ë) - Euml */
"\315", /* capital I, acute accent (Í) - Iacute */
"\316", /* capital I, circumflex accent (Î) - Icirc */
"\314", /* capital I, grave accent (Ì) - Igrave */
"\317", /* capital I, dieresis or umlaut mark (Ï) - Iuml */
"\321", /* capital N, tilde (Ñ) - Ntilde */
"\323", /* capital O, acute accent (Ó) - Oacute */
"\324", /* capital O, circumflex accent (Ô) - Ocirc */
"\322", /* capital O, grave accent (Ò) - Ograve */
"\330", /* capital O, slash (Ø) - Oslash */
"\325", /* capital O, tilde (Õ) - Otilde */
"\326", /* capital O, dieresis or umlaut mark (Ö) - Ouml */
"\336", /* capital THORN, Icelandic (Þ) - THORN */
"\332", /* capital U, acute accent (Ú) - Uacute */
"\333", /* capital U, circumflex accent (Û) - Ucirc */
"\331", /* capital U, grave accent (Ù) - Ugrave */
"\334", /* capital U, dieresis or umlaut mark (Ü) - Uuml */
"\335", /* capital Y, acute accent (Ý) - Yacute */
"\341", /* small a, acute accent (á) - aacute */
"\342", /* small a, circumflex accent (â) - acirc */
"\264", /* spacing acute (´) - acute */
"\346", /* small ae diphthong (ligature) (æ) - aelig */
"\340", /* small a, grave accent (à) - agrave */
"\046", /* ampersand (&) - amp */
"\345", /* small a, ring (å) - aring */
"\343", /* small a, tilde (ã) - atilde */
"\344", /* small a, dieresis or umlaut mark (ä) - auml */
"\246", /* broken vertical bar (¦) - brkbar */
"\246", /* broken vertical bar (¦) - brvbar */
"\347", /* small c, cedilla (ç) - ccedil */
"\270", /* spacing cedilla (¸) - cedil */
"\242", /* cent sign (¢) - cent */
"\251", /* copyright sign (©) - copy */
"\244", /* currency sign (¤) - curren */
"\260", /* degree sign (°) - deg */
"\250", /* spacing dieresis (¨) - die */
"\367", /* division sign (÷) - divide */
"\351", /* small e, acute accent (é) - eacute */
"\352", /* small e, circumflex accent (ê) - ecirc */
"\350", /* small e, grave accent (è) - egrave */
"-", /* dash the width of emsp - emdash */
"\002", /* emsp, em space - not collapsed NEVER CHANGE THIS - emsp */
"-", /* dash the width of ensp - endash */
"\002", /* ensp, en space - not collapsed NEVER CHANGE THIS - ensp */
"\360", /* small eth, Icelandic (ð) - eth */
"\353", /* small e, dieresis or umlaut mark (ë) - euml */
"\275", /* fraction 1/2 (½) - frac12 */
"\274", /* fraction 1/4 (¼) - frac14 */
"\276", /* fraction 3/4 (¾) - frac34 */
"\076", /* greater than (>) - gt */
"\257", /* spacing macron (¯) - hibar */
"\355", /* small i, acute accent (í) - iacute */
"\356", /* small i, circumflex accent (î) - icirc */
"\241", /* inverted exclamation mark (¡) - iexcl */
"\354", /* small i, grave accent (ì) - igrave */
"\277", /* inverted question mark (¿) - iquest */
"\357", /* small i, dieresis or umlaut mark (ï) - iuml */
"\253", /* angle quotation mark, left («) - laquo */
"\074", /* less than (<) - lt */
"\257", /* spacing macron (¯) - macr */
"-", /* dash the width of emsp - mdash */
"\265", /* micro sign (µ) - micro */
"\267", /* middle dot (·) - middot */
"\001", /* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
"-", /* dash the width of ensp - ndash */
"\254", /* negation sign (¬) - not */
"\361", /* small n, tilde (ñ) - ntilde */
"\363", /* small o, acute accent (ó) - oacute */
"\364", /* small o, circumflex accent (ô) - ocirc */
"\362", /* small o, grave accent (ò) - ograve */
"\252", /* feminine ordinal indicator (ª) - ordf */
"\272", /* masculine ordinal indicator (º) - ordm */
"\370", /* small o, slash (ø) - oslash */
"\365", /* small o, tilde (õ) - otilde */
"\366", /* small o, dieresis or umlaut mark (ö) - ouml */
"\266", /* paragraph sign (¶) - para */
"\261", /* plus-or-minus sign (±) - plusmn */
"\243", /* pound sign (£) - pound */
"\042", /* quote '"' (") - quot */
"\273", /* angle quotation mark, right (») - raquo */
"\256", /* circled R registered sign (®) - reg */
"\247", /* section sign (§) - sect */
"\007", /* soft hyphen (decimal 173) NEVER CHANGE THIS - shy */
"\271", /* superscript 1 (¹) - sup1 */
"\262", /* superscript 2 (²) - sup2 */
"\263", /* superscript 3 (³) - sup3 */
"\337", /* small sharp s, German (sz ligature) (ß) - szlig */
"\002", /* thin space - not collapsed NEVER CHANGE THIS - thinsp */
"\376", /* small thorn, Icelandic (þ) - thorn */
"\327", /* multiplication sign (×) - times */
"(TM)", /* circled TM trade mark sign (™) - trade */
"\372", /* small u, acute accent (ú) - uacute */
"\373", /* small u, circumflex accent (û) - ucirc */
"\371", /* small u, grave accent (ù) - ugrave */
"\250", /* spacing dieresis (¨) - uml */
"\374", /* small u, dieresis or umlaut mark (ü) - uuml */
"\375", /* small y, acute accent (ý) - yacute */
"\245", /* yen sign (¥) - yen */
"\377", /* small y, dieresis or umlaut mark (ÿ) - yuml */
};
/* Entity values -- 7 bit character approximations
*
* Same entity order as ISO_Latin1[] above, but every replacement is plain
* ASCII (plus the "\001", "\002", "\007" control codes marked NEVER CHANGE
* THIS). Installed as LYCharSets[1]. Unlike ISO_Latin1[] this table has
* external linkage.
*
* When LY_UMLAUT is defined, the umlauted vowels are written as two letters
* ("Ae", "oe", ...) instead of the bare vowel.
*
* This MUST match exactly the table referred to in the DTD!
*/
const char *SevenBitApproximations[] =
{
"AE", /* capital AE diphthong (ligature) (Æ) - AElig */
"A", /* capital A, acute accent (Á) - Aacute */
"A", /* capital A, circumflex accent (Â) - Acirc */
"A", /* capital A, grave accent (À) - Agrave */
"A", /* capital A, ring - Aring (Å) */
"A", /* capital A, tilde - Atilde (Ã) */
#ifdef LY_UMLAUT
"Ae", /* capital A, dieresis or umlaut mark (Ä) - Auml */
#else
"A", /* capital A, dieresis or umlaut mark (Ä) - Auml */
#endif /* LY_UMLAUT */
"C", /* capital C, cedilla (Ç) - Ccedil */
"Dj", /* capital D with stroke (Ð) - Dstrok */
"DH", /* capital Eth, Icelandic (Ð) - ETH */
"E", /* capital E, acute accent (É) - Eacute */
"E", /* capital E, circumflex accent (Ê) - Ecirc */
"E", /* capital E, grave accent (È) - Egrave */
"E", /* capital E, dieresis or umlaut mark (Ë) - Euml */
"I", /* capital I, acute accent (Í) - Iacute */
"I", /* capital I, circumflex accent (Î) - Icirc */
"I", /* capital I, grave accent (Ì) - Igrave */
"I", /* capital I, dieresis or umlaut mark (Ï) - Iuml */
"N", /* capital N, tilde - Ntilde (Ñ) */
"O", /* capital O, acute accent (Ó) - Oacute */
"O", /* capital O, circumflex accent (Ô) - Ocirc */
"O", /* capital O, grave accent (Ò) - Ograve */
"O", /* capital O, slash (Ø) - Oslash */
"O", /* capital O, tilde (Õ) - Otilde */
#ifdef LY_UMLAUT
"Oe", /* capital O, dieresis or umlaut mark (Ö) - Ouml */
#else
"O", /* capital O, dieresis or umlaut mark (Ö) - Ouml */
#endif /* LY_UMLAUT */
"P", /* capital THORN, Icelandic (Þ) - THORN */
"U", /* capital U, acute accent (Ú) - Uacute */
"U", /* capital U, circumflex accent (Û) - Ucirc */
"U", /* capital U, grave accent (Ù) - Ugrave */
#ifdef LY_UMLAUT
"Ue", /* capital U, dieresis or umlaut mark (Ü) - Uuml */
#else
"U", /* capital U, dieresis or umlaut mark (Ü) - Uuml */
#endif /* LY_UMLAUT */
"Y", /* capital Y, acute accent (Ý) - Yacute */
"a", /* small a, acute accent (á) - aacute */
"a", /* small a, circumflex accent (â) - acirc */
"'", /* spacing acute (´) - acute */
"ae", /* small ae diphthong (ligature) (æ) - aelig */
"`a", /* small a, grave accent (à) - agrave */
"&", /* ampersand (&) - amp */
"a", /* small a, ring (å) - aring */
"a", /* small a, tilde (ã) - atilde */
#ifdef LY_UMLAUT
"ae", /* small a, dieresis or umlaut mark (ä) - auml */
#else
"a", /* small a, dieresis or umlaut mark (ä) - auml */
#endif /* LY_UMLAUT */
"|", /* broken vertical bar (¦) - brkbar */
"|", /* broken vertical bar (¦) - brvbar */
"c", /* small c, cedilla (ç) - ccedil */
",", /* spacing cedilla (¸) - cedil */
"-c-", /* cent sign (¢) - cent */
"(c)", /* copyright sign (©) - copy */
"CUR", /* currency sign (¤) - curren */
"DEG", /* degree sign (°) - deg */
"\042", /* spacing dieresis (¨) - die */
"/", /* division sign (÷) - divide */
"e", /* small e, acute accent (é) - eacute */
"e", /* small e, circumflex accent (ê) - ecirc */
"e", /* small e, grave accent (è) - egrave */
"-", /* dash the width of emsp - emdash */
"\002", /* emsp NEVER CHANGE THIS - emsp */
"-", /* dash the width of ensp - endash */
"\002", /* ensp NEVER CHANGE THIS - ensp */
"dh", /* small eth, Icelandic (ð) - eth */
"e", /* small e, dieresis or umlaut mark (ë) - euml */
" 1/2", /* fraction 1/2 (½) - frac12 */
" 1/4", /* fraction 1/4 (¼) - frac14 */
" 3/4", /* fraction 3/4 (¾) - frac34 */
">", /* greater than (>) - gt */
"-", /* spacing macron (¯) - hibar */
"i", /* small i, acute accent (í) - iacute */
"i", /* small i, circumflex accent (î) - icirc */
"!", /* inverted exclamation mark (¡) - iexcl */
"`i", /* small i, grave accent (ì) - igrave */
"?", /* inverted question mark (¿) - iquest */
"i", /* small i, dieresis or umlaut mark (ï) - iuml */
"<<", /* angle quotation mark, left («) - laquo */
"<", /* less than - lt (<) */
"-", /* spacing macron (¯) - macr */
"-", /* dash the width of emsp - mdash */
"u", /* micro sign (µ) - micro */
".", /* middle dot (·) - middot */
"\001", /* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
"-", /* dash the width of ensp - ndash */
"NOT", /* negation sign (¬) - not */
"n", /* small n, tilde (ñ) - ntilde */
"o", /* small o, acute accent (ó) - oacute */
"o", /* small o, circumflex accent (ô) - ocirc */
"o", /* small o, grave accent (ò) - ograve */
"-a", /* feminine ordinal indicator (ª) - ordf */
"-o", /* masculine ordinal indicator (º) - ordm */
"o", /* small o, slash (ø) - oslash */
"o", /* small o, tilde (õ) - otilde */
#ifdef LY_UMLAUT
"oe", /* small o, dieresis or umlaut mark (ö) - ouml */
#else
"o", /* small o, dieresis or umlaut mark (ö) - ouml */
#endif /* LY_UMLAUT */
"P:", /* paragraph sign (¶) - para */
"+-", /* plus-or-minus sign (±) - plusmn */
"-L-", /* pound sign (£) - pound */
"\"", /* quote '"' (") - quot */
">>", /* angle quotation mark, right (») - raquo */
"(R)", /* circled R registered sign (®) - reg */
"S:", /* section sign (§) - sect */
"\007", /* soft hyphen (decimal 173) NEVER CHANGE THIS - shy */
"^1", /* superscript 1 (¹) - sup1 */
"^2", /* superscript 2 (²) - sup2 */
"^3", /* superscript 3 (³) - sup3 */
"ss", /* small sharp s, German (sz ligature) (ß) - szlig */
"\002", /* thin space - not collapsed NEVER CHANGE THIS - thinsp */
"p", /* small thorn, Icelandic (þ) - thorn */
"*", /* multiplication sign (×) - times */
"(TM)", /* circled TM trade mark sign (™) - trade */
"u", /* small u, acute accent (ú) - uacute */
"u", /* small u, circumflex accent (û) - ucirc */
"u", /* small u, grave accent (ù) - ugrave */
"\042", /* spacing dieresis (¨) - uml */
#ifdef LY_UMLAUT
"ue", /* small u, dieresis or umlaut mark (ü) - uuml */
#else
"u", /* small u, dieresis or umlaut mark (ü) - uuml */
#endif /* LY_UMLAUT */
"y", /* small y, acute accent (ý) - yacute */
"YEN", /* yen sign (¥) - yen */
"y", /* small y, dieresis or umlaut mark (ÿ) - yuml */
};
/*
* Add your new character sets HERE (but only if you can't construct Unicode
* tables for them). - FM
*/
/*
* Add the array name to LYCharSets
*
* Entity-value table for each character set, indexed by charset number.
* HTMLUseCharacterSet() copies the selected entry into p_entity_values.
* Only the first two slots are initialized here; the remaining
* MAXCHARSETS - 2 slots start out as null pointers.
*/
STRING2PTR LYCharSets[MAXCHARSETS] =
{
ISO_Latin1, /* ISO Latin 1 */
SevenBitApproximations, /* 7 Bit Approximations */
};
/*
* Add the name that the user will see below. The order of LYCharSets and
* LYchar_set_names MUST be the same
*
* The list is NULL-terminated (hence MAXCHARSETS + 1 slots);
* UCGetLYhndl_byAnyName() stops scanning at the first null entry.
*/
const char *LYchar_set_names[MAXCHARSETS + 1] =
{
"Western (ISO-8859-1)",
"7 bit approximations (US-ASCII)",
(char *) 0
};
/*
* Associate additional pieces of info with each of the charsets listed above.
* Will be automatically modified (and extended) by charset translations which
* are loaded using the chartrans mechanism. Most important piece of info to
* put here is a MIME charset name. Used for chartrans (see UCDefs.h). The
* order of LYCharSets and LYCharSet_UC MUST be the same.
*
* Note that most of the charsets added by the new mechanism in src/chrtrans
* don't show up here at all. They don't have to.
*
* Code in this file reads the MIMEname, enc, codepoints and like8859 members
* of these entries; see the LYUCcharset declaration for the meaning of each
* initializer position.
*/
LYUCcharset LYCharSet_UC[MAXCHARSETS] =
{
/*
* Zero position placeholder and HTMLGetEntityUCValue() reference. - FM
*/
{-1, "iso-8859-1", UCT_ENC_8BIT, 0,
UCT_REP_IS_LAT1,
UCT_CP_IS_LAT1, UCT_R_LAT1, UCT_R_LAT1},
/*
* Placeholders for Unicode tables. - FM
*/
{-1, "us-ascii", UCT_ENC_7BIT, 0,
UCT_REP_SUBSETOF_LAT1,
UCT_CP_SUBSETOF_LAT1, UCT_R_ASCII, UCT_R_ASCII},
};
/*
* Add the code of the lowest character with the high bit set that can be
* directly displayed. The order of LYCharSets and LYlowest_eightbit MUST be
* the same.
*
* (If a charset has a chartrans unicode table, LYlowest_eightbit will be
* verified/modified anyway.)
*
* HTMLSetCharacterHandling() compares these values against 130, 155, 160
* and 191. The value 999 lies above every 8-bit code, so for the 7-bit
* set the "<= 160" and "<= 130" pass-through tests are never true.
*/
int LYlowest_eightbit[MAXCHARSETS] =
{
160, /* ISO Latin 1 */
999, /* 7 bit approximations */
};
/*
* Function to set the handling of selected character sets based on the current
* LYUseDefaultRawMode value. - FM
*/
void HTMLSetCharacterHandling(int i)
{
int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset);
BOOLEAN LYRawMode_flag = LYRawMode;
int UCLYhndl_for_unspec_flag = UCLYhndl_for_unspec;
if (LYCharSet_UC[i].enc != UCT_ENC_CJK) {
HTCJK = NOCJK;
kanji_code = NOKANJI;
if (i == chndl)
LYRawMode = LYUseDefaultRawMode;
else
LYRawMode = (BOOL) (!LYUseDefaultRawMode);
HTPassEightBitNum = (BOOL) ((LYCharSet_UC[i].codepoints & UCT_CP_SUPERSETOF_LAT1)
|| (LYCharSet_UC[i].like8859 & UCT_R_HIGH8BIT));
if (LYRawMode) {
HTPassEightBitRaw = (BOOL) (LYlowest_eightbit[i] <= 160);
} else {
HTPassEightBitRaw = FALSE;
}
if (LYRawMode || i == chndl) {
HTPassHighCtrlRaw = (BOOL) (LYlowest_eightbit[i] <= 130);
} else {
HTPassHighCtrlRaw = FALSE;
}
HTPassHighCtrlNum = FALSE;
} else { /* CJK encoding: */
const char *mime = LYCharSet_UC[i].MIMEname;
if (!strcmp(mime, "euc-cn")) {
HTCJK = CHINESE;
kanji_code = EUC;
} else if (!strcmp(mime, "euc-jp")) {
HTCJK = JAPANESE;
kanji_code = EUC;
} else if (!strcmp(mime, "shift_jis")) {
HTCJK = JAPANESE;
kanji_code = SJIS;
} else if (!strcmp(mime, "euc-kr")) {
HTCJK = KOREAN;
kanji_code = EUC;
} else if (!strcmp(mime, "big5")) {
HTCJK = TAIPEI;
kanji_code = EUC;
}
/* for any CJK: */
if (!LYUseDefaultRawMode)
HTCJK = NOCJK;
LYRawMode = (BOOL) (IS_CJK_TTY ? TRUE : FALSE);
HTPassEightBitRaw = FALSE;
HTPassEightBitNum = FALSE;
HTPassHighCtrlRaw = (BOOL) (IS_CJK_TTY ? TRUE : FALSE);
HTPassHighCtrlNum = FALSE;
}
/*
* Comment for coding below:
* UCLYhndl_for_unspec is "current" state with LYRawMode, but
* UCAssume_MIMEcharset is independent from LYRawMode: holds the history
* and may be changed from 'O'ptions menu only. - LP
*/
if (LYRawMode) {
UCLYhndl_for_unspec = i; /* UCAssume_MIMEcharset not changed! */
} else {
if (chndl != i &&
(LYCharSet_UC[i].enc != UCT_ENC_CJK ||
LYCharSet_UC[chndl].enc != UCT_ENC_CJK)) {
UCLYhndl_for_unspec = chndl; /* fall to UCAssume_MIMEcharset */
} else {
UCLYhndl_for_unspec = LATIN1; /* UCAssume_MIMEcharset not changed! */
}
}
#ifdef USE_SLANG
if (LYlowest_eightbit[i] > 191) {
/*
* Higher than this may output cntrl chars to screen. - KW
*/
SLsmg_Display_Eight_Bit = 191;
} else {
SLsmg_Display_Eight_Bit = LYlowest_eightbit[i];
}
#endif /* USE_SLANG */
ena_csi(LYlowest_eightbit[current_char_set] > 155);
/* some diagnostics */
if (TRACE) {
if (LYRawMode_flag != LYRawMode)
CTRACE((tfp,
"HTMLSetCharacterHandling: LYRawMode changed %s -> %s\n",
(LYRawMode_flag ? "ON" : "OFF"),
(LYRawMode ? "ON" : "OFF")));
if (UCLYhndl_for_unspec_flag != UCLYhndl_for_unspec)
CTRACE((tfp,
"HTMLSetCharacterHandling: UCLYhndl_for_unspec changed %d -> %d\n",
UCLYhndl_for_unspec_flag,
UCLYhndl_for_unspec));
}
return;
}
/*
 * Set HTCJK from the "in" (document) and "out" (display) charset MIME names.
 *
 * Outside raw mode HTCJK is always NOCJK. In raw mode a CJK language is
 * selected only when both names belong to the same language family;
 * any other combination yields NOCJK.
 *
 * Synonyms need not be checked: the MIME names come from LYCharSet_UC.
 */
void Set_HTCJK(const char *inMIMEname,
               const char *outMIMEname)
{
    if (!LYRawMode) {
        HTCJK = NOCJK;
        return;
    }

    if ((strcmp(inMIMEname, "euc-jp") == 0
#ifdef EXP_JAPANESEUTF8_SUPPORT
         || strcmp(inMIMEname, "utf-8") == 0
#endif
         || strcmp(inMIMEname, "shift_jis") == 0)
        && (strcmp(outMIMEname, "euc-jp") == 0
            || strcmp(outMIMEname, "shift_jis") == 0)) {
        HTCJK = JAPANESE;
    } else if (strcmp(inMIMEname, "euc-cn") == 0
               && strcmp(outMIMEname, "euc-cn") == 0) {
        HTCJK = CHINESE;
    } else if (strcmp(inMIMEname, "big5") == 0
               && strcmp(outMIMEname, "big5") == 0) {
        HTCJK = TAIPEI;
    } else if (strcmp(inMIMEname, "euc-kr") == 0
               && strcmp(outMIMEname, "euc-kr") == 0) {
        HTCJK = KOREAN;
    } else {
        HTCJK = NOCJK;
    }
}
/*
* Function to set the LYDefaultRawMode value based on the selected character
* set. - FM
*
* Currently unused: the default value so obvious that LYUseDefaultRawMode
* utilized directly by someone's mistake. - LP
*/
static void HTMLSetRawModeDefault(int i)
{
LYDefaultRawMode = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK);
return;
}
/*
 * Derive LYUseDefaultRawMode from character set i and the raw-mode value
 * passed in modeflag. - FM
 *
 * modeflag is stored unchanged for CJK encodings and for the assumed
 * document charset (UCAssume_MIMEcharset); for any other charset its
 * logical negation is stored.
 */
void HTMLSetUseDefaultRawMode(int i,
                              int modeflag)
{
    /* The assumed-charset lookup is only performed for non-CJK sets. */
    if (LYCharSet_UC[i].enc == UCT_ENC_CJK
        || i == safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset)) {
        LYUseDefaultRawMode = (BOOLEAN) modeflag;
    } else {
        LYUseDefaultRawMode = (BOOL) (!modeflag);
    }
}
/*
* Function to set the LYHaveCJKCharacterSet value based on the selected
* character set. - FM
*/
static void HTMLSetHaveCJKCharacterSet(int i)
{
LYHaveCJKCharacterSet = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK);
return;
}
/*
 * Function to set the DisplayCharsetMatchLocale value based on the selected
 * character set. It is used in UPPER8 for 8bit case-insensitive search by
 * matching def7_uni.tbl images. - LP
 *
 * Result:
 *   - CJK display charset:            TRUE (old-style behaviour).
 *   - MIME name starts with "cp" or
 *     "windows" (case-insensitive):   FALSE.
 *   - otherwise, without LOCALE:      TRUE only for a UTF-8 display.
 *   - otherwise, with LOCALE:         TRUE unless UCForce8bitTOUPPER is set.
 *
 * Relies on LYHaveCJKCharacterSet already being up to date for set i.
 *
 * The prefix test used to be written without the negations. Since
 * strncasecomp() yields zero on a match (assumed to follow the usual
 * strncasecmp() convention - confirm against HTString), the old condition
 * was true for every name and the final branch could never run.
 */
static void HTMLSetDisplayCharsetMatchLocale(int i)
{
    BOOLEAN match;

    if (LYHaveCJKCharacterSet) {
        /*
         * We have no intention to pass CJK via UCTransChar if that happened.
         * Let someone from CJK correct this if necessary.
         */
        DisplayCharsetMatchLocale = TRUE;       /* old-style */
        return;
    } else if (!strncasecomp(LYCharSet_UC[i].MIMEname, "cp", 2) ||
               !strncasecomp(LYCharSet_UC[i].MIMEname, "windows", 7)) {
        /*
         * Assume dos/windows displays usually on remote terminal, hence it
         * rarely matches locale. (In fact, MS Windows codepoints locale are
         * never seen on UNIX).
         */
        match = FALSE;
    } else {
        match = TRUE;           /* guess, but see below */
#if !defined(LOCALE)
        if (LYCharSet_UC[i].enc != UCT_ENC_UTF8)
            /*
             * Leave true for utf-8 display - the code doesn't deal very well
             * with this case. - kw
             */
            match = FALSE;
#else
        if (UCForce8bitTOUPPER) {
            /*
             * Force disable locale (from lynx.cfg)
             */
            match = FALSE;
        }
#endif
    }
    DisplayCharsetMatchLocale = match;
    return;
}
/*
* Compatibility code for lynx 2.8/2.7.2 (and earlier): the "human-readable"
* charset names have changed over time, so the historical names are mapped to
* MIME names here. This keeps old lynx.cfg and (especially) .lynxrc files
* recognized. Please update this table whenever you change the "fullname" of
* any present charset.
*/
typedef struct _names_pairs {
const char *fullname;   /* historical display name as written in config files */
const char *MIMEname;   /* MIME charset name it stands for */
} names_pairs;
/* *INDENT-OFF* */
/*
 * Historical display name -> MIME name pairs, searched linearly by
 * UCGetLYhndl_byAnyName() with a case-sensitive exact match. Several old
 * names may map to the same MIME name. The list ends at the first entry
 * whose fullname is NULL.
 */
static const names_pairs OLD_charset_names[] =
{
{"ISO Latin 1", "iso-8859-1"},
{"ISO Latin 2", "iso-8859-2"},
{"WinLatin1 (cp1252)", "windows-1252"},
{"DEC Multinational", "dec-mcs"},
{"Macintosh (8 bit)", "macintosh"},
{"NeXT character set", "next"},
{"KOI8-R Cyrillic", "koi8-r"},
{"Chinese", "euc-cn"},
{"Japanese (EUC)", "euc-jp"},
{"Japanese (SJIS)", "shift_jis"},
{"Korean", "euc-kr"},
{"Taipei (Big5)", "big5"},
{"Vietnamese (VISCII)", "viscii"},
{"7 bit approximations", "us-ascii"},
{"Transparent", "x-transparent"},
{"DosLatinUS (cp437)", "cp437"},
{"IBM PC character set", "cp437"},
{"DosLatin1 (cp850)", "cp850"},
{"IBM PC codepage 850", "cp850"},
{"DosLatin2 (cp852)", "cp852"},
{"PC Latin2 CP 852", "cp852"},
{"DosCyrillic (cp866)", "cp866"},
{"DosArabic (cp864)", "cp864"},
{"DosGreek (cp737)", "cp737"},
{"DosBaltRim (cp775)", "cp775"},
{"DosGreek2 (cp869)", "cp869"},
{"DosHebrew (cp862)", "cp862"},
{"WinLatin2 (cp1250)", "windows-1250"},
{"WinCyrillic (cp1251)", "windows-1251"},
{"WinGreek (cp1253)", "windows-1253"},
{"WinHebrew (cp1255)", "windows-1255"},
{"WinArabic (cp1256)", "windows-1256"},
{"WinBaltRim (cp1257)", "windows-1257"},
{"ISO Latin 3", "iso-8859-3"},
{"ISO Latin 4", "iso-8859-4"},
{"ISO 8859-5 Cyrillic", "iso-8859-5"},
{"ISO 8859-6 Arabic", "iso-8859-6"},
{"ISO 8859-7 Greek", "iso-8859-7"},
{"ISO 8859-8 Hebrew", "iso-8859-8"},
{"ISO-8859-8-I", "iso-8859-8"},
{"ISO-8859-8-E", "iso-8859-8"},
{"ISO 8859-9 (Latin 5)", "iso-8859-9"},
{"ISO 8859-10", "iso-8859-10"},
{"UNICODE UTF 8", "utf-8"},
{"RFC 1345 w/o Intro", "mnemonic+ascii+0"},
{"RFC 1345 Mnemonic", "mnemonic"},
{NULL, NULL}, /* terminated with NULL */
};
/* *INDENT-ON* */
/*
* lynx 2.8/2.7.2 compatibility code: read "character_set" parameter from
* lynx.cfg and .lynxrc in both MIME name and "human-readable" name (old and
* new style). Returns -1 if not recognized.
*/
int UCGetLYhndl_byAnyName(char *value)
{
int i;
if (value == NULL)
return -1;
LYTrimTrailing(value);
CTRACE((tfp, "UCGetLYhndl_byAnyName(%s)\n", value));
/* search by name */
for (i = 0; (i < MAXCHARSETS && LYchar_set_names[i]); i++) {
if (!strcmp(value, LYchar_set_names[i])) {
return i; /* OK */
}
}
/* search by old name from 2.8/2.7.2 version */
for (i = 0; (OLD_charset_names[i].fullname); i++) {
if (!strcmp(value, OLD_charset_names[i].fullname)) {
return UCGetLYhndl_byMIME(OLD_charset_names[i].MIMEname); /* OK */
}
}
return UCGetLYhndl_byMIME(value); /* by MIME */
}
/*
* Entity names -- Ordered by ISO Latin 1 value.
* ---------------------------------------------
* For conversions of DECIMAL escaped entities.
* Must be in order of ascending value.
*
* The first entry corresponds to decimal 160 and the last to 255, so the
* table index is (value - 160). HTMLGetEntityName() indexes this table
* directly with its argument.
*/
static const char *LYEntityNames[] =
{
/* NAME DECIMAL VALUE */
"nbsp", /* 160, non breaking space */
"iexcl", /* 161, inverted exclamation mark */
"cent", /* 162, cent sign */
"pound", /* 163, pound sign */
"curren", /* 164, currency sign */
"yen", /* 165, yen sign */
"brvbar", /* 166, broken vertical bar, (brkbar) */
"sect", /* 167, section sign */
"uml", /* 168, spacing dieresis */
"copy", /* 169, copyright sign */
"ordf", /* 170, feminine ordinal indicator */
"laquo", /* 171, angle quotation mark, left */
"not", /* 172, negation sign */
"shy", /* 173, soft hyphen */
"reg", /* 174, circled R registered sign */
"hibar", /* 175, spacing macron */
"deg", /* 176, degree sign */
"plusmn", /* 177, plus-or-minus sign */
"sup2", /* 178, superscript 2 */
"sup3", /* 179, superscript 3 */
"acute", /* 180, spacing acute (96) */
"micro", /* 181, micro sign */
"para", /* 182, paragraph sign */
"middot", /* 183, middle dot */
"cedil", /* 184, spacing cedilla */
"sup1", /* 185, superscript 1 */
"ordm", /* 186, masculine ordinal indicator */
"raquo", /* 187, angle quotation mark, right */
"frac14", /* 188, fraction 1/4 */
"frac12", /* 189, fraction 1/2 */
"frac34", /* 190, fraction 3/4 */
"iquest", /* 191, inverted question mark */
"Agrave", /* 192, capital A, grave accent */
"Aacute", /* 193, capital A, acute accent */
"Acirc", /* 194, capital A, circumflex accent */
"Atilde", /* 195, capital A, tilde */
"Auml", /* 196, capital A, dieresis or umlaut mark */
"Aring", /* 197, capital A, ring */
"AElig", /* 198, capital AE diphthong (ligature) */
"Ccedil", /* 199, capital C, cedilla */
"Egrave", /* 200, capital E, grave accent */
"Eacute", /* 201, capital E, acute accent */
"Ecirc", /* 202, capital E, circumflex accent */
"Euml", /* 203, capital E, dieresis or umlaut mark */
"Igrave", /* 204, capital I, grave accent */
"Iacute", /* 205, capital I, acute accent */
"Icirc", /* 206, capital I, circumflex accent */
"Iuml", /* 207, capital I, dieresis or umlaut mark */
"ETH", /* 208, capital Eth, Icelandic (or Latin2 Dstrok) */
"Ntilde", /* 209, capital N, tilde */
"Ograve", /* 210, capital O, grave accent */
"Oacute", /* 211, capital O, acute accent */
"Ocirc", /* 212, capital O, circumflex accent */
"Otilde", /* 213, capital O, tilde */
"Ouml", /* 214, capital O, dieresis or umlaut mark */
"times", /* 215, multiplication sign */
"Oslash", /* 216, capital O, slash */
"Ugrave", /* 217, capital U, grave accent */
"Uacute", /* 218, capital U, acute accent */
"Ucirc", /* 219, capital U, circumflex accent */
"Uuml", /* 220, capital U, dieresis or umlaut mark */
"Yacute", /* 221, capital Y, acute accent */
"THORN", /* 222, capital THORN, Icelandic */
"szlig", /* 223, small sharp s, German (sz ligature) */
"agrave", /* 224, small a, grave accent */
"aacute", /* 225, small a, acute accent */
"acirc", /* 226, small a, circumflex accent */
"atilde", /* 227, small a, tilde */
"auml", /* 228, small a, dieresis or umlaut mark */
"aring", /* 229, small a, ring */
"aelig", /* 230, small ae diphthong (ligature) */
"ccedil", /* 231, small c, cedilla */
"egrave", /* 232, small e, grave accent */
"eacute", /* 233, small e, acute accent */
"ecirc", /* 234, small e, circumflex accent */
"euml", /* 235, small e, dieresis or umlaut mark */
"igrave", /* 236, small i, grave accent */
"iacute", /* 237, small i, acute accent */
"icirc", /* 238, small i, circumflex accent */
"iuml", /* 239, small i, dieresis or umlaut mark */
"eth", /* 240, small eth, Icelandic */
"ntilde", /* 241, small n, tilde */
"ograve", /* 242, small o, grave accent */
"oacute", /* 243, small o, acute accent */
"ocirc", /* 244, small o, circumflex accent */
"otilde", /* 245, small o, tilde */
"ouml", /* 246, small o, dieresis or umlaut mark */
"divide", /* 247, division sign */
"oslash", /* 248, small o, slash */
"ugrave", /* 249, small u, grave accent */
"uacute", /* 250, small u, acute accent */
"ucirc", /* 251, small u, circumflex accent */
"uuml", /* 252, small u, dieresis or umlaut mark */
"yacute", /* 253, small y, acute accent */
"thorn", /* 254, small thorn, Icelandic */
"yuml", /* 255, small y, dieresis or umlaut mark */
};
/*
 * Function to return the entity names of ISO-8859-1 8-bit characters. - FM
 *
 * code is an index into LYEntityNames[], whose first entry is the name for
 * decimal 160; callers are therefore expected to pass (value - 160) -
 * confirm at the call sites.
 *
 * Returns the entity name, or the empty string "" when code lies outside
 * the table. The returned pointer refers to static storage.
 *
 * The parameter is used directly; no helper macro is defined, so nothing
 * leaks into the rest of the translation unit.
 */
const char *HTMLGetEntityName(UCode_t code)
{
    const int last_index = (int) TABLESIZE(LYEntityNames) - 1;

    if (code < 0 || code > last_index) {
        return "";
    }

    return LYEntityNames[code];
}
/*
 * Look up an entity name and return its UCode_t (long int) value, or 0 when
 * the name is NULL/empty or not present in the table.
 *
 * unicode_entities[] handles all the names from old style entities[] too.
 * Lynx now calls unicode_entities[] only through this function:
 * HTMLGetEntityUCValue(). Note, we need not check for special characters here
 * in function or even before it, we should check them *after* invoking this
 * function, see put_special_unicodes() in SGML.c.
 *
 * In the future we will try to isolate all calls to entities[] in favor of new
 * unicode-based chartrans scheme. - LP
 *
 * The lookup is a binary search, so unicode_entities[] (pulled in from
 * entities.h inside this function) must be sorted consistently with
 * AS_cmp(). Matching is case sensitive.
 */
UCode_t HTMLGetEntityUCValue(const char *name)
{
#include <entities.h>

    size_t lo = 0;
    size_t hi = TABLESIZE(unicode_entities);

    /*
     * Make sure we have a non-zero length name. - FM
     */
    if (isEmpty(name))
        return 0;

    /* Half-open search window [lo, hi). */
    while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;
        int order = AS_cmp(unicode_entities[mid].name, name);

        if (order == 0)
            return unicode_entities[mid].code;

        if (order < 0)
            lo = mid + 1;       /* table entry sorts before name */
        else
            hi = mid;           /* table entry sorts after name */
    }

    return 0;
}
/*
 * Original comment -
 * Assume these are Microsoft code points, inflicted on us by FrontPage. - FM
 *
 * MS FrontPage uses syntax like &#153; in 128-159 range and doesn't follow
 * Unicode standards for this area. Windows-1252 codepoints are assumed here.
 *
 * However see -
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#character-encodings-0
 *
 * Mapping performed:
 *   1         -> U+263A WHITE SMILING FACE
 *   128..159  -> the Windows-1252 character for that byte, or U+FFFD
 *                (replacement character, by convention) for the bytes
 *                Windows-1252 leaves undefined: 129, 141, 143, 144, 157
 *   otherwise -> code is returned unchanged
 */
UCode_t LYcp1252ToUnicode(UCode_t code)
{
    /* Indexed by (code - 128). */
    static const UCode_t cp1252_high[32] =
    {
        0x20ac,                 /* 128 EURO currency sign */
        0xfffd,                 /* 129 undefined */
        0x201a,                 /* 130 SINGLE LOW-9 QUOTATION MARK (sbquo) */
        0x0192,                 /* 131 LATIN SMALL LETTER F WITH HOOK */
        0x201e,                 /* 132 DOUBLE LOW-9 QUOTATION MARK (bdquo) */
        0x2026,                 /* 133 HORIZONTAL ELLIPSIS (hellip) */
        0x2020,                 /* 134 DAGGER (dagger) */
        0x2021,                 /* 135 DOUBLE DAGGER (Dagger) */
        0x02c6,                 /* 136 MODIFIER LETTER CIRCUMFLEX ACCENT */
        0x2030,                 /* 137 PER MILLE SIGN (permil) */
        0x0160,                 /* 138 LATIN CAPITAL LETTER S WITH CARON */
        0x2039,                 /* 139 SINGLE LEFT-POINTING ANGLE QUOTATION MARK (lsaquo) */
        0x0152,                 /* 140 LATIN CAPITAL LIGATURE OE */
        0xfffd,                 /* 141 undefined */
        0x017d,                 /* 142 LATIN CAPITAL LETTER Z WITH CARON */
        0xfffd,                 /* 143 undefined */
        0xfffd,                 /* 144 undefined */
        0x2018,                 /* 145 LEFT SINGLE QUOTATION MARK (lsquo) */
        0x2019,                 /* 146 RIGHT SINGLE QUOTATION MARK (rsquo) */
        0x201c,                 /* 147 LEFT DOUBLE QUOTATION MARK (ldquo) */
        0x201d,                 /* 148 RIGHT DOUBLE QUOTATION MARK (rdquo) */
        0x2022,                 /* 149 BULLET (bull) */
        0x2013,                 /* 150 EN DASH (ndash) */
        0x2014,                 /* 151 EM DASH (mdash) */
        0x02dc,                 /* 152 SMALL TILDE (tilde) */
        0x2122,                 /* 153 TRADE MARK SIGN (trade) */
        0x0161,                 /* 154 LATIN SMALL LETTER S WITH CARON */
        0x203a,                 /* 155 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (rsaquo) */
        0x0153,                 /* 156 LATIN SMALL LIGATURE OE */
        0xfffd,                 /* 157 undefined */
        0x017e,                 /* 158 LATIN SMALL LETTER Z WITH CARON */
        0x0178                  /* 159 LATIN CAPITAL LETTER Y WITH DIAERESIS */
    };

    if (code == 1)
        return 0x263a;          /* WHITE SMILING FACE */

    if (code > 127 && code < 160)
        return cp1252_high[code - 128];

    return code;
}
/*
 * Select character set i: point p_entity_values at its entity table, then
 * update the raw-mode default, the character handling (LYRawMode and CJK
 * attributes), the LYHaveCJKCharacterSet flag and finally
 * DisplayCharsetMatchLocale. - FM
 *
 * The last three calls must stay in this order: the locale-match step
 * reads LYHaveCJKCharacterSet, which the step before it sets.
 */
void HTMLUseCharacterSet(int i)
{
    p_entity_values = LYCharSets[i];

    HTMLSetRawModeDefault(i);
    HTMLSetCharacterHandling(i);
    HTMLSetHaveCJKCharacterSet(i);
    HTMLSetDisplayCharsetMatchLocale(i);
}
/*
* Initializer, calls initialization function for the CHARTRANS handling. - KW
*/
int LYCharSetsDeclared(void)
{
UCInit();
return UCInitialized;
}
#ifdef USE_CHARSET_CHOICE
/*
 * Make sure the charsets currently in use are selectable, and (for the old
 * options menu screen) rebuild the menu tables.
 *
 * Always un-hides UCLYhndl_for_unspec in the assumed-charset list and
 * current_char_set in the display-charset list. Both are used as array
 * indices without a range check.
 *
 * Unless ALL_CHARSETS_IN_O_MENU_SCREEN is defined, it then fills, for every
 * non-hidden charset below LYNumCharsets:
 *   display_charset_map[] / display_charset_choices[]   (display charsets)
 *   assumed_doc_charset_map[] / assumed_charset_choices[] (assumed charsets)
 * records each assumed charset's menu position in
 * charset_subsets[].assumed_idx, and stores the menu position of
 * current_char_set in displayed_display_charset_idx.
 *
 * Both choice lists are NULL-terminated once, after the loop, so they are
 * terminated even when LYNumCharsets is 0.
 */
void init_charset_subsets(void)
{
#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
    int i;
    int cur_display = 0;
    int cur_assumed = 0;
#endif

    /* add them to displayed values */
    charset_subsets[UCLYhndl_for_unspec].hide_assumed = FALSE;
    charset_subsets[current_char_set].hide_display = FALSE;

#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
    /* all this stuff is for supporting old menu screen... */
    for (i = 0; i < LYNumCharsets; ++i) {
        if (charset_subsets[i].hide_display == FALSE) {
            if (i == current_char_set)
                displayed_display_charset_idx = cur_display;
            display_charset_map[cur_display] = i;
            display_charset_choices[cur_display] = LYchar_set_names[i];
            cur_display++;
        }
        if (charset_subsets[i].hide_assumed == FALSE) {
            assumed_doc_charset_map[cur_assumed] = i;
            assumed_charset_choices[cur_assumed] = LYCharSet_UC[i].MIMEname;
            charset_subsets[i].assumed_idx = cur_assumed;
            cur_assumed++;
        }
    }
    display_charset_choices[cur_display] = NULL;
    assumed_charset_choices[cur_assumed] = NULL;
#endif
}
#endif /* USE_CHARSET_CHOICE */