/*
 * NOTE(review): the header names of the #include directives below are
 * missing in this copy of the file; restore them from the project
 * before building.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Flags and state owned by other modules; this file assigns them in
 * HTMLSetCharacterHandling() and Set_HTCJK().
 */
extern BOOL HTPassEightBitRaw;
extern BOOL HTPassEightBitNum;
extern BOOL HTPassHighCtrlRaw;
extern BOOL HTPassHighCtrlNum;
extern HTCJKlang HTCJK;

/* CJK / locale state defined here and updated when a charset is selected. */
PUBLIC HTkcode kanji_code = NOKANJI;
PUBLIC BOOLEAN LYHaveCJKCharacterSet = FALSE;
PUBLIC BOOLEAN DisplayCharsetMatchLocale = TRUE;

/* Chartrans initializer and its status flag, used by LYCharSetsDeclared(). */
extern void UCInit NOARGS;
extern int UCInitialized;

PUBLIC int LYNumCharsets = 0; /* Will be initialized later by UC_Register. */
PUBLIC int current_char_set = -1; /* will be initialized later in LYMain.c */
PUBLIC CONST char** p_entity_values = NULL; /* Pointer, for HTML_put_entity()*/
                              /* obsolete and probably not used(???)        */
                              /* will be initialized in HTMLUseCharacterSet */

/*
 *  INSTRUCTIONS for adding new character sets which do not have
 *              Unicode tables.
 *
 *  Currently we only declare some charset's properties here
 *  (such as MIME names, etc.), it does not include real mapping.
 *
 *  [We hope you need not correct/add old-style mapping
 *  as in ISO_LATIN1[] or SevenBitApproximations[] any more -
 *  it works now via new chartrans mechanism, but kept for compatibility only:
 *  we should cleanup the stuff, but this is not so easy...]
 *
 *  There is a place marked "Add your new character sets HERE" in this file.
 *  Make up a character set and add it in the same
 *  style as the ISO_LATIN1 set below, giving it a unique name.
 *
 *  Add the name of the set to LYCharSets.
 *  Similarly add the appropriate information to the tables below:
 *  LYchar_set_names, LYCharSet_UC, LYlowest_eightbit.
 *  These 4 tables all MUST have the same order.
 *  (And this is the order you will see in Lynx Options Menu,
 *  which is why few unicode-based charsets are listed here).
 *
 */

/*      Entity values -- for ISO Latin 1 local representation
**
**      This MUST match exactly the table referred to in the DTD!
*/
/*
 * One replacement string per named entity.  Most entries are a single
 * ISO-8859-1 byte written as an octal escape; the exceptions are the
 * control-code placeholders "\001" (nbsp), "\002" (em/en/thin space) and
 * "\007" (shy), which are marked NEVER CHANGE THIS, and plain ASCII
 * stand-ins ("-", "(TM)") for entities the comments describe as dashes
 * and the trade mark sign.  This table is element 0 of LYCharSets[].
 */
PRIVATE CONST char * ISO_Latin1[] = {
    "\306",     /* capital AE diphthong (ligature) (Æ) - AElig */
    "\301",     /* capital A, acute accent (Á) - Aacute */
    "\302",     /* capital A, circumflex accent (Â) - Acirc */
    "\300",     /* capital A, grave accent (À) - Agrave */
    "\305",     /* capital A, ring - Aring (Å) */
    "\303",     /* capital A, tilde - Atilde (Ã) */
    "\304",     /* capital A, dieresis or umlaut mark (Ä) - Auml */
    "\307",     /* capital C, cedilla - Ccedil (Ç) */
    "\320",     /* capital Eth or D with stroke (Ð) - Dstrok */
    "\320",     /* capital Eth, Icelandic (Ð) - ETH */
    "\311",     /* capital E, acute accent (É) - Eacute */
    "\312",     /* capital E, circumflex accent (Ê) - Ecirc */
    "\310",     /* capital E, grave accent (È) - Egrave */
    "\313",     /* capital E, dieresis or umlaut mark (Ë) - Euml */
    "\315",     /* capital I, acute accent (Í) - Iacute */
    "\316",     /* capital I, circumflex accent (Î) - Icirc */
    "\314",     /* capital I, grave accent (Ì) - Igrave */
    "\317",     /* capital I, dieresis or umlaut mark (Ï) - Iuml */
    "\321",     /* capital N, tilde (Ñ) - Ntilde */
    "\323",     /* capital O, acute accent (Ó) - Oacute */
    "\324",     /* capital O, circumflex accent (Ô) - Ocirc */
    "\322",     /* capital O, grave accent (Ò) - Ograve */
    "\330",     /* capital O, slash (Ø) - Oslash */
    "\325",     /* capital O, tilde (Õ) - Otilde */
    "\326",     /* capital O, dieresis or umlaut mark (Ö) - Ouml */
    "\336",     /* capital THORN, Icelandic (Þ) - THORN */
    "\332",     /* capital U, acute accent (Ú) - Uacute */
    "\333",     /* capital U, circumflex accent (Û) - Ucirc */
    "\331",     /* capital U, grave accent (Ù) - Ugrave */
    "\334",     /* capital U, dieresis or umlaut mark (Ü) - Uuml */
    "\335",     /* capital Y, acute accent (Ý) - Yacute */
    "\341",     /* small a, acute accent (á) - aacute */
    "\342",     /* small a, circumflex accent (â) - acirc */
    "\264",     /* spacing acute (´) - acute */
    "\346",     /* small ae diphthong (ligature) (æ) - aelig */
    "\340",     /* small a, grave accent (à) - agrave */
    "\046",     /* ampersand (&) - amp */
    "\345",     /* small a, ring (å) - aring */
    "\343",     /* small a, tilde (ã) - atilde */
    "\344",     /* small a, dieresis or umlaut mark (ä) - auml */
    "\246",     /* broken vertical bar (¦) - brkbar */
    "\246",     /* broken vertical bar (¦) - brvbar */
    "\347",     /* small c, cedilla (ç) - ccedil */
    "\270",     /* spacing cedilla (¸) - cedil */
    "\242",     /* cent sign (¢) - cent */
    "\251",     /* copyright sign (©) - copy */
    "\244",     /* currency sign (¤) - curren */
    "\260",     /* degree sign (°) - deg */
    "\250",     /* spacing dieresis (¨) - die */
    "\367",     /* division sign (÷) - divide */
    "\351",     /* small e, acute accent (é) - eacute */
    "\352",     /* small e, circumflex accent (ê) - ecirc */
    "\350",     /* small e, grave accent (è) - egrave */
    "-",        /* dash the width of emsp - emdash */
    "\002",     /* emsp, em space - not collapsed NEVER CHANGE THIS - emsp */
    "-",        /* dash the width of ensp - endash */
    "\002",     /* ensp, en space - not collapsed NEVER CHANGE THIS - ensp */
    "\360",     /* small eth, Icelandic (ð) - eth */
    "\353",     /* small e, dieresis or umlaut mark (ë) - euml */
    "\275",     /* fraction 1/2 (½) - frac12 */
    "\274",     /* fraction 1/4 (¼) - frac14 */
    "\276",     /* fraction 3/4 (¾) - frac34 */
    "\076",     /* greater than (>) - gt */
    "\257",     /* spacing macron (¯) - hibar */
    "\355",     /* small i, acute accent (í) - iacute */
    "\356",     /* small i, circumflex accent (î) - icirc */
    "\241",     /* inverted exclamation mark (¡) - iexcl */
    "\354",     /* small i, grave accent (ì) - igrave */
    "\277",     /* inverted question mark (¿) - iquest */
    "\357",     /* small i, dieresis or umlaut mark (ï) - iuml */
    "\253",     /* angle quotation mark, left («) - laquo */
    "\074",     /* less than (<) - lt */
    "\257",     /* spacing macron (¯) - macr */
    "-",        /* dash the width of emsp - mdash */
    "\265",     /* micro sign (µ) - micro */
    "\267",     /* middle dot (·) - middot */
    "\001",     /* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
    "-",        /* dash the width of ensp - ndash */
    "\254",     /* negation sign (¬) - not */
    "\361",     /* small n, tilde (ñ) - ntilde */
    "\363",     /* small o, acute accent (ó) - oacute */
    "\364",     /* small o, circumflex accent (ô) - ocirc */
    "\362",     /* small o, grave accent (ò) - ograve */
    "\252",     /* feminine ordinal indicator (ª) - ordf */
    "\272",     /* masculine ordinal indicator (º) - ordm */
    "\370",     /* small o, slash (ø) - oslash */
    "\365",     /* small o, tilde (õ) - otilde */
    "\366",     /* small o, dieresis or umlaut mark (ö) - ouml */
    "\266",     /* paragraph sign (¶) - para */
    "\261",     /* plus-or-minus sign (±) - plusmn */
    "\243",     /* pound sign (£) - pound */
    "\042",     /* quote '"' (") - quot */
    "\273",     /* angle quotation mark, right (») - raquo */
    "\256",     /* circled R registered sign (®) - reg */
    "\247",     /* section sign (§) - sect */
    "\007",     /* soft hyphen (U+00AD) NEVER CHANGE THIS - shy */
    "\271",     /* superscript 1 (¹) - sup1 */
    "\262",     /* superscript 2 (²) - sup2 */
    "\263",     /* superscript 3 (³) - sup3 */
    "\337",     /* small sharp s, German (sz ligature) (ß) - szlig */
    "\002",     /* thin space - not collapsed NEVER CHANGE THIS - thinsp */
    "\376",     /* small thorn, Icelandic (þ) - thorn */
    "\327",     /* multiplication sign (×) - times */
    "(TM)",     /* circled TM trade mark sign (™) - trade */
    "\372",     /* small u, acute accent (ú) - uacute */
    "\373",     /* small u, circumflex accent (û) - ucirc */
    "\371",     /* small u, grave accent (ù) - ugrave */
    "\250",     /* spacing dieresis (¨) - uml */
    "\374",     /* small u, dieresis or umlaut mark (ü) - uuml */
    "\375",     /* small y, acute accent (ý) - yacute */
    "\245",     /* yen sign (¥) - yen */
    "\377",     /* small y, dieresis or umlaut mark (ÿ) - yuml */
};

/*      Entity values -- 7 bit character approximations
**
**      This MUST match exactly the table referred to in the DTD!
*/
/*
 * ASCII-only replacement strings, in the same entity order as
 * ISO_Latin1[].  When LY_UMLAUT is defined the umlauted vowels are
 * written as two letters ("Ae", "oe", ...) instead of the bare vowel.
 * The "\001", "\002" and "\007" placeholders are the same ones used in
 * ISO_Latin1[] and are marked NEVER CHANGE THIS.  LYCharSets[] uses this
 * table for every charset except ISO Latin 1.
 */
PUBLIC CONST char * SevenBitApproximations[] = {
    "AE",       /* capital AE diphthong (ligature) (Æ) - AElig */
    "A",        /* capital A, acute accent (Á) - Aacute */
    "A",        /* capital A, circumflex accent (Â) - Acirc */
    "A",        /* capital A, grave accent (À) - Agrave */
    "A",        /* capital A, ring - Aring (Å) */
    "A",        /* capital A, tilde - Atilde (Ã) */
#ifdef LY_UMLAUT
    "Ae",       /* capital A, dieresis or umlaut mark (Ä) - Auml */
#else
    "A",        /* capital A, dieresis or umlaut mark (Ä) - Auml */
#endif /* LY_UMLAUT */
    "C",        /* capital C, cedilla (Ç) - Ccedil */
    "Dj",       /* capital D with stroke (Ð) - Dstrok */
    "DH",       /* capital Eth, Icelandic (Ð) - ETH */
    "E",        /* capital E, acute accent (É) - Eacute */
    "E",        /* capital E, circumflex accent (Ê) - Ecirc */
    "E",        /* capital E, grave accent (È) - Egrave */
    "E",        /* capital E, dieresis or umlaut mark (Ë) - Euml */
    "I",        /* capital I, acute accent (Í) - Iacute */
    "I",        /* capital I, circumflex accent (Î) - Icirc */
    "I",        /* capital I, grave accent (Ì) - Igrave */
    "I",        /* capital I, dieresis or umlaut mark (Ï) - Iuml */
    "N",        /* capital N, tilde - Ntilde (Ñ) */
    "O",        /* capital O, acute accent (Ó) - Oacute */
    "O",        /* capital O, circumflex accent (Ô) - Ocirc */
    "O",        /* capital O, grave accent (Ò) - Ograve */
    "O",        /* capital O, slash (Ø) - Oslash */
    "O",        /* capital O, tilde (Õ) - Otilde */
#ifdef LY_UMLAUT
    "Oe",       /* capital O, dieresis or umlaut mark (Ö) - Ouml */
#else
    "O",        /* capital O, dieresis or umlaut mark (Ö) - Ouml */
#endif /* LY_UMLAUT */
    "P",        /* capital THORN, Icelandic (Þ) - THORN */
    "U",        /* capital U, acute accent (Ú) - Uacute */
    "U",        /* capital U, circumflex accent (Û) - Ucirc */
    "U",        /* capital U, grave accent (Ù) - Ugrave */
#ifdef LY_UMLAUT
    "Ue",       /* capital U, dieresis or umlaut mark (Ü) - Uuml */
#else
    "U",        /* capital U, dieresis or umlaut mark (Ü) - Uuml */
#endif /* LY_UMLAUT */
    "Y",        /* capital Y, acute accent (Ý) - Yacute */
    "a",        /* small a, acute accent (á) - aacute */
    "a",        /* small a, circumflex accent (â) - acirc */
    "'",        /* spacing acute (´) - acute */
    "ae",       /* small ae diphthong (ligature) (æ) - aelig */
    "`a",       /* small a, grave accent (à) - agrave */
    "&",        /* ampersand (&) - amp */
    "a",        /* small a, ring (å) - aring */
    "a",        /* small a, tilde (ã) - atilde */
#ifdef LY_UMLAUT
    "ae",       /* small a, dieresis or umlaut mark (ä) - auml */
#else
    "a",        /* small a, dieresis or umlaut mark (ä) - auml */
#endif /* LY_UMLAUT */
    "|",        /* broken vertical bar (¦) - brkbar */
    "|",        /* broken vertical bar (¦) - brvbar */
    "c",        /* small c, cedilla (ç) - ccedil */
    ",",        /* spacing cedilla (¸) - cedil */
    "-c-",      /* cent sign (¢) - cent */
    "(c)",      /* copyright sign (©) - copy */
    "CUR",      /* currency sign (¤) - curren */
    "DEG",      /* degree sign (°) - deg */
    "\042",     /* spacing dieresis (¨) - die */
    "/",        /* division sign (÷) - divide */
    "e",        /* small e, acute accent (é) - eacute */
    "e",        /* small e, circumflex accent (ê) - ecirc */
    "e",        /* small e, grave accent (è) - egrave */
    "-",        /* dash the width of emsp - emdash */
    "\002",     /* emsp NEVER CHANGE THIS - emsp */
    "-",        /* dash the width of ensp - endash */
    "\002",     /* ensp NEVER CHANGE THIS - ensp */
    "dh",       /* small eth, Icelandic eth (ð) */
    "e",        /* small e, dieresis or umlaut mark (ë) - euml */
    " 1/2",     /* fraction 1/2 (½) - frac12 */
    " 1/4",     /* fraction 1/4 (¼) - frac14 */
    " 3/4",     /* fraction 3/4 (¾) - frac34 */
    ">",        /* greater than (>) - gt */
    "-",        /* spacing macron (¯) - hibar */
    "i",        /* small i, acute accent (í) - iacute */
    "i",        /* small i, circumflex accent (î) - icirc */
    "!",        /* inverted exclamation mark (¡) - iexcl */
    "`i",       /* small i, grave accent (ì) - igrave */
    "?",        /* inverted question mark (¿) - iquest */
    "i",        /* small i, dieresis or umlaut mark (ï) - iuml */
    "<<",       /* angle quotation mark, left («) - laquo */
    "<",        /* less than - lt (<) */
    "-",        /* spacing macron (¯) - macr */
    "-",        /* dash the width of emsp - mdash */
    "u",        /* micro sign (µ) - micro */
    ".",        /* middle dot (·) - middot */
    "\001",     /* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
    "-",        /* dash the width of ensp - ndash */
    "NOT",      /* negation sign (¬) - not */
    "n",        /* small n, tilde (ñ) - ntilde */
    "o",        /* small o, acute accent (ó) - oacute */
    "o",        /* small o, circumflex accent (ô) - ocirc */
    "o",        /* small o, grave accent (ò) - ograve */
    "-a",       /* feminine ordinal indicator (ª) - ordf */
    "-o",       /* masculine ordinal indicator (º) - ordm */
    "o",        /* small o, slash (ø) - oslash */
    "o",        /* small o, tilde (õ) - otilde */
#ifdef LY_UMLAUT
    "oe",       /* small o, dieresis or umlaut mark (ö) - ouml */
#else
    "o",        /* small o, dieresis or umlaut mark (ö) - ouml */
#endif /* LY_UMLAUT */
    "P:",       /* paragraph sign (¶) - para */
    "+-",       /* plus-or-minus sign (±) - plusmn */
    "-L-",      /* pound sign (£) - pound */
    "\"",       /* quote '"' (") - quot */
    ">>",       /* angle quotation mark, right (») - raquo */
    "(R)",      /* circled R registered sign (®) - reg */
    "S:",       /* section sign (§) - sect */
    "\007",     /* soft hyphen (U+00AD) NEVER CHANGE THIS - shy */
    "^1",       /* superscript 1 (¹) - sup1 */
    "^2",       /* superscript 2 (²) - sup2 */
    "^3",       /* superscript 3 (³) - sup3 */
    "ss",       /* small sharp s, German (sz ligature) (ß) - szlig */
    "\002",     /* thin space - not collapsed NEVER CHANGE THIS - thinsp */
    "p",        /* small thorn, Icelandic (þ) - thorn */
    "*",        /* multiplication sign (×) - times */
    "(TM)",     /* circled TM trade mark sign (™) - trade */
    "u",        /* small u, acute accent (ú) - uacute */
    "u",        /* small u, circumflex accent (û) - ucirc */
    "u",        /* small u, grave accent (ù) - ugrave */
    "\042",     /* spacing dieresis (¨) - uml */
#ifdef LY_UMLAUT
    "ue",       /* small u, dieresis or umlaut mark (ü) - uuml */
#else
    "u",        /* small u, dieresis or umlaut mark (ü) - uuml */
#endif /* LY_UMLAUT */
    "y",        /* small y, acute accent (ý) - yacute */
    "YEN",      /* yen sign (¥) - yen */
    "y",        /* small y, dieresis or umlaut mark (ÿ) - yuml */
};

/*
 * Add your new character sets HERE (but only if you
 * can't construct Unicode tables for them).
- FM */

/*
 *  Add the array name to LYCharSets
 *
 *  Each slot points at the entity-value table used for that charset;
 *  only ISO Latin 1 has its own table, every other slot listed here
 *  uses SevenBitApproximations.  HTMLUseCharacterSet() copies the
 *  selected slot into p_entity_values.
 */
PUBLIC CONST char ** LYCharSets[MAXCHARSETS]={
    ISO_Latin1,                 /* ISO Latin 1          */
    SevenBitApproximations,     /* DosLatin1 (cp850)    */
    SevenBitApproximations,     /* WinLatin1 (cp1252)   */
    SevenBitApproximations,     /* DosLatinUS (cp437)   */
    SevenBitApproximations,     /* DEC Multinational    */
    SevenBitApproximations,     /* Macintosh (8 bit)    */
    SevenBitApproximations,     /* NeXT character set   */
    SevenBitApproximations,     /* Chinese              */
    SevenBitApproximations,     /* Japanese (EUC-JP)    */
    SevenBitApproximations,     /* Japanese (Shift_JIS) */
    SevenBitApproximations,     /* Korean               */
    SevenBitApproximations,     /* Taipei (Big5)        */
    SevenBitApproximations,     /* Vietnamese (VISCII)  */
    SevenBitApproximations,     /* 7 Bit Approximations */
    SevenBitApproximations,     /* Transparent          */
};

/*
 *  Add the name that the user will see below.
 *  The order of LYCharSets and LYchar_set_names MUST be the same
 *
 *  The array has one slot more than LYCharSets so that the list can be
 *  terminated by a null pointer; UCGetLYhndl_byAnyName() stops its
 *  search at that terminator.
 */
PUBLIC CONST char * LYchar_set_names[MAXCHARSETS + 1]={
    "Western (ISO-8859-1)",
    "Western (cp850)",
    "Western (windows-1252)",
    "IBM PC US codepage (cp437)",
    "DEC Multinational",
    "Macintosh (8 bit)",
    "NeXT character set",
    "Chinese",
    "Japanese (EUC-JP)",
    "Japanese (Shift_JIS)",
    "Korean",
    "Taipei (Big5)",
    "Vietnamese (VISCII)",
    "7 bit approximations (US-ASCII)",
    "Transparent",
    (char *) 0
};

/*
 *  Associate additional pieces of info with each of the charsets listed
 *  above.
 *  Will be automatically modified (and extended) by charset translations
 *  which are loaded using the chartrans mechanism.
 *  Most important piece of info to put here is a MIME charset name.
 *  Used for chartrans.
 *  The order of LYCharSets and LYCharSet_UC MUST be the same.
 *
 *  Note that most of the charsets added by the new mechanism in src/chrtrans
 *  don't show up here at all.  They don't have to.
 *
 *  In this file the MIMEname, enc, codepoints and like8859 members are
 *  read by the functions below; the second field of each initializer is
 *  the MIME name and the third is the encoding class (UCT_ENC_*).
 */
PUBLIC LYUCcharset LYCharSet_UC[MAXCHARSETS]=
{
  /*
   *  Zero position placeholder and HTMLGetEntityUCValue() reference. - FM
   */
  {-1,"iso-8859-1",     UCT_ENC_8BIT,
                        UCT_REP_IS_LAT1,
                        UCT_CP_IS_LAT1, UCT_R_LAT1,UCT_R_LAT1},

  /*
   *  Placeholders for Unicode tables. - FM
   */
  {-1,"cp850",          UCT_ENC_8BIT,
                        UCT_REP_SUPERSETOF_LAT1,
                        0,              UCT_R_8BIT,UCT_R_ASCII},
  {-1,"windows-1252",   UCT_ENC_8BIT,
                        UCT_REP_SUPERSETOF_LAT1,
                        0,              UCT_R_8BIT,UCT_R_ASCII},
  {-1,"cp437",          UCT_ENC_8BIT,0,0,       UCT_R_8BIT,UCT_R_ASCII},
  {-1,"dec-mcs",        UCT_ENC_8BIT,0,0,       UCT_R_8BIT,UCT_R_ASCII},
  {-1,"macintosh",      UCT_ENC_8BIT,0,0,       UCT_R_8BIT,UCT_R_ASCII},
  {-1,"next",           UCT_ENC_8BIT,0,0,       UCT_R_8BIT,UCT_R_ASCII},

  /*
   *  There is no strict correlation for the next five, since the transfer
   *  charset gets decoded into Display Char Set by the CJK code (separate
   *  from Unicode mechanism).  For now we use the MIME name that describes
   *  what is output to the terminal. - KW
   */
  {-1,"euc-cn",         UCT_ENC_CJK,0,0,        UCT_R_8BIT,UCT_R_ASCII},
  {-1,"euc-jp",         UCT_ENC_CJK,0,0,        UCT_R_8BIT,UCT_R_ASCII},
  {-1,"shift_jis",      UCT_ENC_CJK,0,0,        UCT_R_8BIT,UCT_R_ASCII},
  {-1,"euc-kr",         UCT_ENC_CJK,0,0,        UCT_R_8BIT,UCT_R_ASCII},
  {-1,"big5",           UCT_ENC_CJK,0,0,        UCT_R_8BIT,UCT_R_ASCII},

  /*
   *  Placeholders for Unicode tables. - FM
   */
  {-1,"viscii",         UCT_ENC_8BIT_C0,0,0,    UCT_R_8BIT,UCT_R_ASCII},
  {-1,"us-ascii",       UCT_ENC_7BIT,
                        UCT_REP_SUBSETOF_LAT1,
                        UCT_CP_SUBSETOF_LAT1,   UCT_R_ASCII,UCT_R_ASCII},

  /*
   *  Placeholder for non-translation mode. - FM
   */
  {-1,"x-transparent",  UCT_ENC_8BIT,0,0,       UCT_R_8BIT,UCT_R_ASCII}
};

/*
 *  Add the code of the lowest character with the high bit set
 *  that can be directly displayed.
 *  The order of LYCharSets and LYlowest_eightbit MUST be the same.
 *
 *  (If charset have chartrans unicode table,
 *  LYlowest_eightbit will be verified/modified anyway.)
*/ PUBLIC int LYlowest_eightbit[MAXCHARSETS]={ 160, /* ISO Latin 1 */ 128, /* DosLatin1 (cp850) */ 130, /* WinLatin1 (cp1252) */ 128, /* DosLatinUS (cp437) */ 160, /* DEC Multinational */ 128, /* Macintosh (8 bit) */ 128, /* NeXT character set */ 128, /* Chinese */ 128, /* Japanese (EUC) */ 128, /* Japanese (SJIS) */ 128, /* Korean */ 128, /* Taipei (Big5) */ 128, /* Vietnamese (VISCII) */ 999, /* 7 bit approximations */ 128 /* Transparent (???) */ }; /* * Function to set the handling of selected character sets * based on the current LYUseDefaultRawMode value. - FM */ PUBLIC void HTMLSetCharacterHandling ARGS1(int,i) { int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset); if (LYCharSet_UC[i].enc != UCT_ENC_CJK) { HTCJK = NOCJK; kanji_code = NOKANJI; if (i == chndl) LYRawMode = LYUseDefaultRawMode; else LYRawMode = (!LYUseDefaultRawMode); HTPassEightBitNum = ((LYCharSet_UC[i].codepoints & UCT_CP_SUPERSETOF_LAT1) || (LYCharSet_UC[i].like8859 & UCT_R_HIGH8BIT)); if (LYRawMode) { HTPassEightBitRaw = (LYlowest_eightbit[i] <= 160); } else { HTPassEightBitRaw = FALSE; } if (LYRawMode || i == chndl) { HTPassHighCtrlRaw = (LYlowest_eightbit[i] <= 130); } else { HTPassHighCtrlRaw = FALSE; } HTPassHighCtrlNum = FALSE; } else { /* CJK encoding: */ CONST char *mime = LYCharSet_UC[i].MIMEname; if (!strcmp(mime, "euc-cn")) { HTCJK = CHINESE; kanji_code = EUC; } else if (!strcmp(mime, "euc-jp")) { HTCJK = JAPANESE; kanji_code = EUC; } else if (!strcmp(mime, "shift_jis")) { HTCJK = JAPANESE; kanji_code = SJIS; } else if (!strcmp(mime, "euc-kr")) { HTCJK = KOREAN; kanji_code = EUC; } else if (!strcmp(mime, "big5")) { HTCJK = TAIPEI; kanji_code = EUC; } /* for any CJK: */ if (!LYUseDefaultRawMode) HTCJK = NOCJK; LYRawMode = (HTCJK != NOCJK) ? TRUE : FALSE; HTPassEightBitRaw = FALSE; HTPassEightBitNum = FALSE; HTPassHighCtrlRaw = (HTCJK != NOCJK) ? 
TRUE : FALSE; HTPassHighCtrlNum = FALSE; } /* * Comment for coding below: * UCLYhndl_for_unspec is "current" state with LYRawMode, * but UCAssume_MIMEcharset is independent from LYRawMode: * holds the history and may be changed from 'O'ptions menu only. - LP */ if (LYRawMode) { UCLYhndl_for_unspec = i; /* UCAssume_MIMEcharset not changed! */ } else { if (chndl != i && (LYCharSet_UC[i].enc != UCT_ENC_CJK || LYCharSet_UC[chndl].enc != UCT_ENC_CJK)) { UCLYhndl_for_unspec = chndl; /* fall to UCAssume_MIMEcharset */ } else { UCLYhndl_for_unspec = LATIN1; /* UCAssume_MIMEcharset not changed! */ } } #ifdef USE_SLANG if (LYlowest_eightbit[i] > 191) { /* * Higher than this may output cntrl chars to screen. - KW */ SLsmg_Display_Eight_Bit = 191; } else { SLsmg_Display_Eight_Bit = LYlowest_eightbit[i]; } #endif /* USE_SLANG */ ena_csi((LYlowest_eightbit[current_char_set] > 155)); return; } /* * Function to set HTCJK based on "in" and "out" charsets. */ PUBLIC void Set_HTCJK ARGS2( CONST char *, inMIMEname, CONST char *, outMIMEname) { if (LYRawMode) { if ((!strcmp(inMIMEname, "euc-jp") || !strcmp(inMIMEname, "shift_jis")) && (!strcmp(outMIMEname, "euc-jp") || !strcmp(outMIMEname, "shift_jis"))) { HTCJK = JAPANESE; } else if (!strcmp(inMIMEname, "euc-cn") && !strcmp(outMIMEname, "euc-cn")) { HTCJK = CHINESE; } else if (!strcmp(inMIMEname, "big5") && !strcmp(outMIMEname, "big5")) { HTCJK = TAIPEI; } else if (!strcmp(inMIMEname, "euc-kr") && !strcmp(outMIMEname, "euc-kr")) { HTCJK = KOREAN; } else { HTCJK = NOCJK; } } else { HTCJK = NOCJK; } } /* * Function to set the LYDefaultRawMode value * based on the selected character set. - FM * * Currently unused: the default value so obvious * that LYUseDefaultRawMode utilized directly by someone's mistake. 
- LP */ PRIVATE void HTMLSetRawModeDefault ARGS1(int,i) { LYDefaultRawMode = (LYCharSet_UC[i].enc == UCT_ENC_CJK); return; } /* * Function to set the LYUseDefaultRawMode value * based on the selected character set and the * current LYRawMode value. - FM */ PUBLIC void HTMLSetUseDefaultRawMode ARGS2( int, i, BOOLEAN, modeflag) { if (LYCharSet_UC[i].enc != UCT_ENC_CJK) { int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset); if (i == chndl) LYUseDefaultRawMode = modeflag; else LYUseDefaultRawMode = (!modeflag); } else /* CJK encoding: */ LYUseDefaultRawMode = modeflag; return; } /* * Function to set the LYHaveCJKCharacterSet value * based on the selected character set. - FM */ PRIVATE void HTMLSetHaveCJKCharacterSet ARGS1(int,i) { LYHaveCJKCharacterSet = (LYCharSet_UC[i].enc == UCT_ENC_CJK); return; } /* * Function to set the DisplayCharsetMatchLocale value * based on the selected character set. * It is used in UPPER8 for 8bit case-insensitive search * by matching def7_uni.tbl images. - LP */ PRIVATE void HTMLSetDisplayCharsetMatchLocale ARGS1(int,i) { BOOLEAN match; if (LYHaveCJKCharacterSet) { /* ** We have no intention to pass CJK via UCTransChar if that happened. ** Let someone from CJK correct this if necessary. */ DisplayCharsetMatchLocale = TRUE; /* old-style */ return; } else if (strncasecomp(LYCharSet_UC[i].MIMEname, "cp", 2) || strncasecomp(LYCharSet_UC[i].MIMEname, "windows", 7)) { /* ** Assume dos/windows displays usually on remote terminal, hence it ** rarely matches locale. (In fact, MS Windows codepoints locale are ** never seen on UNIX). 
*/ match = FALSE; } else { match = TRUE; /* guess, but see below */ #if !defined(LOCALE) match = FALSE; #else if (UCForce8bitTOUPPER) { /* ** Force disable locale (from lynx.cfg) */ match = FALSE; } #endif } DisplayCharsetMatchLocale = match; return; } /* * lynx 2.8/2.7.2(and more early) compatibility code: * "human-readable" charset names changes with time * so we map that history names to MIME here * to get old lynx.cfg and (especially) .lynxrc always recognized. * Please update this table when you change "fullname" of any present charset. */ typedef struct _names_pairs { CONST char * fullname; CONST char * MIMEname; } names_pairs; PRIVATE CONST names_pairs OLD_charset_names[] = { {"ISO Latin 1", "iso-8859-1"}, {"ISO Latin 2", "iso-8859-2"}, {"WinLatin1 (cp1252)", "windows-1252"}, {"DEC Multinational", "dec-mcs"}, {"Macintosh (8 bit)", "macintosh"}, {"NeXT character set", "next"}, {"KOI8-R Cyrillic", "koi8-r"}, {"Chinese", "euc-cn"}, {"Japanese (EUC)", "euc-jp"}, {"Japanese (SJIS)", "shift_jis"}, {"Korean", "euc-kr"}, {"Taipei (Big5)", "big5"}, {"Vietnamese (VISCII)", "viscii"}, {"7 bit approximations","us-ascii"}, {"Transparent", "x-transparent"}, {"DosLatinUS (cp437)", "cp437"}, {"IBM PC character set","cp437"}, {"DosLatin1 (cp850)", "cp850"}, {"IBM PC codepage 850", "cp850"}, {"DosLatin2 (cp852)", "cp852"}, {"PC Latin2 CP 852", "cp852"}, {"DosCyrillic (cp866)", "cp866"}, {"DosArabic (cp864)", "cp864"}, {"DosGreek (cp737)", "cp737"}, {"DosBaltRim (cp775)", "cp775"}, {"DosGreek2 (cp869)", "cp869"}, {"DosHebrew (cp862)", "cp862"}, {"WinLatin2 (cp1250)", "windows-1250"}, {"WinCyrillic (cp1251)","windows-1251"}, {"WinGreek (cp1253)", "windows-1253"}, {"WinHebrew (cp1255)", "windows-1255"}, {"WinArabic (cp1256)", "windows-1256"}, {"WinBaltRim (cp1257)", "windows-1257"}, {"ISO Latin 3", "iso-8859-3"}, {"ISO Latin 4", "iso-8859-4"}, {"ISO 8859-5 Cyrillic", "iso-8859-5"}, {"ISO 8859-6 Arabic", "iso-8859-6"}, {"ISO 8859-7 Greek", "iso-8859-7"}, {"ISO 8859-8 Hebrew", 
"iso-8859-8"}, {"ISO 8859-9 (Latin 5)","iso-8859-9"}, {"ISO 8859-10", "iso-8859-10"}, {"UNICODE UTF 8", "utf-8"}, {"RFC 1345 w/o Intro", "mnemonic+ascii+0"}, {"RFC 1345 Mnemonic", "mnemonic"}, {NULL, NULL}, /* terminated with NULL */ }; /* * lynx 2.8/2.7.2 compatibility code: * read "character_set" parameter from lynx.cfg and .lynxrc * in both MIME name and "human-readable" name (old and new style). * Returns -1 if not recognized. */ PUBLIC int UCGetLYhndl_byAnyName ARGS1 (char *, value) { int i; LYTrimTrailing(value); if (value == NULL) return -1; /* search by name */ for (i = 0; (i < MAXCHARSETS && LYchar_set_names[i]); i++) { if (!strcmp(value, LYchar_set_names[i])) { return i; /* OK */ } } /* search by old name from 2.8/2.7.2 version */ for (i = 0; (OLD_charset_names[i].fullname); i++) { if (!strcmp(value, OLD_charset_names[i].fullname)) { return UCGetLYhndl_byMIME(OLD_charset_names[i].MIMEname); /* OK */ } } return UCGetLYhndl_byMIME(value); /* by MIME */ } /* * Entity names -- Ordered by ISO Latin 1 value. * --------------------------------------------- * For conversions of DECIMAL escaped entities. * Must be in order of ascending value. 
*/ PRIVATE CONST char * LYEntityNames[] = { /* NAME DECIMAL VALUE */ "nbsp", /* 160, non breaking space */ "iexcl", /* 161, inverted exclamation mark */ "cent", /* 162, cent sign */ "pound", /* 163, pound sign */ "curren", /* 164, currency sign */ "yen", /* 165, yen sign */ "brvbar", /* 166, broken vertical bar, (brkbar) */ "sect", /* 167, section sign */ "uml", /* 168, spacing dieresis */ "copy", /* 169, copyright sign */ "ordf", /* 170, feminine ordinal indicator */ "laquo", /* 171, angle quotation mark, left */ "not", /* 172, negation sign */ "shy", /* 173, soft hyphen */ "reg", /* 174, circled R registered sign */ "hibar", /* 175, spacing macron */ "deg", /* 176, degree sign */ "plusmn", /* 177, plus-or-minus sign */ "sup2", /* 178, superscript 2 */ "sup3", /* 179, superscript 3 */ "acute", /* 180, spacing acute (96) */ "micro", /* 181, micro sign */ "para", /* 182, paragraph sign */ "middot", /* 183, middle dot */ "cedil", /* 184, spacing cedilla */ "sup1", /* 185, superscript 1 */ "ordm", /* 186, masculine ordinal indicator */ "raquo", /* 187, angle quotation mark, right */ "frac14", /* 188, fraction 1/4 */ "frac12", /* 189, fraction 1/2 */ "frac34", /* 190, fraction 3/4 */ "iquest", /* 191, inverted question mark */ "Agrave", /* 192, capital A, grave accent */ "Aacute", /* 193, capital A, acute accent */ "Acirc", /* 194, capital A, circumflex accent */ "Atilde", /* 195, capital A, tilde */ "Auml", /* 196, capital A, dieresis or umlaut mark */ "Aring", /* 197, capital A, ring */ "AElig", /* 198, capital AE diphthong (ligature) */ "Ccedil", /* 199, capital C, cedilla */ "Egrave", /* 200, capital E, grave accent */ "Eacute", /* 201, capital E, acute accent */ "Ecirc", /* 202, capital E, circumflex accent */ "Euml", /* 203, capital E, dieresis or umlaut mark */ "Igrave", /* 204, capital I, grave accent */ "Iacute", /* 205, capital I, acute accent */ "Icirc", /* 206, capital I, circumflex accent */ "Iuml", /* 207, capital I, dieresis or umlaut mark */ "ETH", /* 
208, capital Eth, Icelandic (or Latin2 Dstrok) */ "Ntilde", /* 209, capital N, tilde */ "Ograve", /* 210, capital O, grave accent */ "Oacute", /* 211, capital O, acute accent */ "Ocirc", /* 212, capital O, circumflex accent */ "Otilde", /* 213, capital O, tilde */ "Ouml", /* 214, capital O, dieresis or umlaut mark */ "times", /* 215, multiplication sign */ "Oslash", /* 216, capital O, slash */ "Ugrave", /* 217, capital U, grave accent */ "Uacute", /* 218, capital U, acute accent */ "Ucirc", /* 219, capital U, circumflex accent */ "Uuml", /* 220, capital U, dieresis or umlaut mark */ "Yacute", /* 221, capital Y, acute accent */ "THORN", /* 222, capital THORN, Icelandic */ "szlig", /* 223, small sharp s, German (sz ligature) */ "agrave", /* 224, small a, grave accent */ "aacute", /* 225, small a, acute accent */ "acirc", /* 226, small a, circumflex accent */ "atilde", /* 227, small a, tilde */ "auml", /* 228, small a, dieresis or umlaut mark */ "aring", /* 229, small a, ring */ "aelig", /* 230, small ae diphthong (ligature) */ "ccedil", /* 231, small c, cedilla */ "egrave", /* 232, small e, grave accent */ "eacute", /* 233, small e, acute accent */ "ecirc", /* 234, small e, circumflex accent */ "euml", /* 235, small e, dieresis or umlaut mark */ "igrave", /* 236, small i, grave accent */ "iacute", /* 237, small i, acute accent */ "icirc", /* 238, small i, circumflex accent */ "iuml", /* 239, small i, dieresis or umlaut mark */ "eth", /* 240, small eth, Icelandic */ "ntilde", /* 241, small n, tilde */ "ograve", /* 242, small o, grave accent */ "oacute", /* 243, small o, acute accent */ "ocirc", /* 244, small o, circumflex accent */ "otilde", /* 245, small o, tilde */ "ouml", /* 246, small o, dieresis or umlaut mark */ "divide", /* 247, division sign */ "oslash", /* 248, small o, slash */ "ugrave", /* 249, small u, grave accent */ "uacute", /* 250, small u, acute accent */ "ucirc", /* 251, small u, circumflex accent */ "uuml", /* 252, small u, dieresis or umlaut mark 
*/ "yacute", /* 253, small y, acute accent */ "thorn", /* 254, small thorn, Icelandic */ "yuml", /* 255, small y, dieresis or umlaut mark */ }; /* * Function to return the entity names of * ISO-8859-1 8-bit characters. - FM */ PUBLIC CONST char * HTMLGetEntityName ARGS1( UCode_t, code) { #define IntValue code int MaxValue = ((sizeof(LYEntityNames)/sizeof(char **)) - 1); if (IntValue < 0 || IntValue > MaxValue) { return ""; } return LYEntityNames[IntValue]; } /* * Function to return the UCode_t (long int) value for entity names * in the ISO_Latin1 and UC_entity_info unicode_entities arrays. * It returns 0 if not found. - FM * * unicode_entities[] handles all the names from old style entities[] too. * Lynx now calls unicode_entities[] only through this function: * HTMLGetEntityUCValue(). Note, we need not check for special characters * here in function or even before it, we should check them *after* * invoking this function, see put_special_unicodes() in SGML.c. * * In the future we will try to isolate all calls to entities[] * in favor of new unicode-based chartrans scheme. - LP */ PUBLIC UCode_t HTMLGetEntityUCValue ARGS1( CONST char *, name) { UCode_t value = 0; size_t i, high, low; int diff = 0; CONST UC_entity_info * unicode_entities = HTML_dtd.unicode_entity_info; /* * Make sure we have a non-zero length name. - FM */ if (!(name && *name)) return(value); /* * Try UC_entity_info unicode_entities[]. */ #ifdef NOT_ASCII /* S/390 -- gil -- 1656 */ for (i = 0; i < HTML_dtd.number_of_unicode_entities; i++ ) { /* ** Linear search for NOT_ASCII. */ #else /* NOT_ASCII */ for (low = 0, high = HTML_dtd.number_of_unicode_entities; high > low; diff < 0 ? (low = i+1) : (high = i)) { /* ** Binary search. */ i = (low + (high-low)/2); #endif /* NOT_ASCII S/390 -- gil -- 1662 */ diff = strcmp(unicode_entities[i].name, name); /* Case sensitive! 
*/ if (diff == 0) { value = unicode_entities[i].code; break; } } /* } NOT_ASCII */ return(value); } /* * Function to select a character set and then set the * character handling and LYHaveCJKCharacterSet flag. - FM */ PUBLIC void HTMLUseCharacterSet ARGS1(int, i) { HTMLSetRawModeDefault(i); p_entity_values = LYCharSets[i]; HTMLSetCharacterHandling(i); /* set LYRawMode and CJK attributes */ HTMLSetHaveCJKCharacterSet(i); HTMLSetDisplayCharsetMatchLocale(i); return; } /* * Initializer, calls initialization function for the * CHARTRANS handling. - KW */ PUBLIC int LYCharSetsDeclared NOPARAMS { UCInit(); return UCInitialized; }