/*
 * $LynxId: LYCharSets.c,v 1.66 2010/09/25 12:46:05 tom Exp $
 */
/*
 * NOTE(review): the twelve #include directives below carry no header names in
 * this copy of the file.  The names have to be restored before the file can
 * be compiled.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Kanji encoding currently in effect; assigned by HTMLSetCharacterHandling(). */
HTkcode kanji_code = NOKANJI;

/* Set by HTMLSetHaveCJKCharacterSet() from the encoding of the selected set. */
BOOLEAN LYHaveCJKCharacterSet = FALSE;

/* Set by HTMLSetDisplayCharsetMatchLocale(). */
BOOLEAN DisplayCharsetMatchLocale = TRUE;

BOOL force_old_UCLYhndl_on_reload = FALSE;
int forced_UCLYhdnl;
int LYNumCharsets = 0;		/* Will be initialized later by UC_Register. */
int current_char_set = -1;	/* will be initialized later in LYMain.c */
int linedrawing_char_set = -1;
const char **p_entity_values = NULL;	/* Pointer, for HTML_put_entity() */
				/* obsolete and probably not used(???) */
				/* will be initialized in HTMLUseCharacterSet */

#ifdef USE_CHARSET_CHOICE
/* Per-charset hide/show flags; filled in by init_charset_subsets(). */
charset_subset_t charset_subsets[MAXCHARSETS];
BOOL custom_display_charset = FALSE;
BOOL custom_assumed_doc_charset = FALSE;

#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
/*
 * Menu position -> charset index maps and the NULL-terminated choice lists
 * built from them by init_charset_subsets().
 */
int display_charset_map[MAXCHARSETS];
int assumed_doc_charset_map[MAXCHARSETS];

const char *display_charset_choices[MAXCHARSETS + 1];
const char *assumed_charset_choices[MAXCHARSETS + 1];
int displayed_display_charset_idx;
#endif
#endif /* USE_CHARSET_CHOICE */

/*
 * New character sets now declared with UCInit() in UCdomap.c
 *
 * INSTRUCTIONS for adding new character sets which do not have
 * Unicode tables now in UCdomap.h
 *
 *
 * [We hope you need not correct/add old-style mapping below as in ISO_LATIN1[]
 * or SevenBitApproximations[] any more - it works now via new chartrans
 * mechanism, but kept for compatibility only: we should cleanup the stuff,
 * but this is not so easy...]
 *
 * Currently we only declare some charset's properties here (such as MIME
 * names, etc.), it does not include real mapping.
 *
 * There is a place marked "Add your new character sets HERE" in this file.
 * Make up a character set and add it in the same style as the ISO_LATIN1 set
 * below, giving it a unique name.
 *
 * Add the name of the set to LYCharSets.
Similarly add the appropriate
 * information to the tables below: LYchar_set_names, LYCharSet_UC,
 * LYlowest_eightbit. These 4 tables all MUST have the same order. (And this
 * is the order you will see in Lynx Options Menu, which is why few
 * unicode-based charsets are listed here).
 *
 */

/*	Entity values -- for ISO Latin 1 local representation
 *
 *	This MUST match exactly the table referred to in the DTD!
 *
 *	The table is positional: one string per entity, in the order of the
 *	entity names given at the end of each comment.  Do not insert, remove
 *	or reorder entries without changing the DTD table in step.  The entries
 *	marked NEVER CHANGE THIS are low control codes rather than printable
 *	characters.
 */
static const char *ISO_Latin1[] =
{
    "\306",			/* capital AE diphthong (ligature) (Æ) - AElig */
    "\301",			/* capital A, acute accent (Á) - Aacute */
    "\302",			/* capital A, circumflex accent (Â) - Acirc */
    "\300",			/* capital A, grave accent (À) - Agrave */
    "\305",			/* capital A, ring - Aring (Å) */
    "\303",			/* capital A, tilde - Atilde (Ã) */
    "\304",			/* capital A, dieresis or umlaut mark (Ä) - Auml */
    "\307",			/* capital C, cedilla - Ccedil (Ç) */
    "\320",			/* capital Eth or D with stroke (Ð) - Dstrok */
    "\320",			/* capital Eth, Icelandic (Ð) - ETH */
    "\311",			/* capital E, acute accent (É) - Eacute */
    "\312",			/* capital E, circumflex accent (Ê) - Ecirc */
    "\310",			/* capital E, grave accent (È) - Egrave */
    "\313",			/* capital E, dieresis or umlaut mark (Ë) - Euml */
    "\315",			/* capital I, acute accent (Í) - Iacute */
    "\316",			/* capital I, circumflex accent (Î) - Icirc */
    "\314",			/* capital I, grave accent (Ì) - Igrave */
    "\317",			/* capital I, dieresis or umlaut mark (Ï) - Iuml */
    "\321",			/* capital N, tilde (Ñ) - Ntilde */
    "\323",			/* capital O, acute accent (Ó) - Oacute */
    "\324",			/* capital O, circumflex accent (Ô) - Ocirc */
    "\322",			/* capital O, grave accent (Ò) - Ograve */
    "\330",			/* capital O, slash (Ø) - Oslash */
    "\325",			/* capital O, tilde (Õ) - Otilde */
    "\326",			/* capital O, dieresis or umlaut mark (Ö) - Ouml */
    "\336",			/* capital THORN, Icelandic (Þ) - THORN */
    "\332",			/* capital U, acute accent (Ú) - Uacute */
    "\333",			/* capital U, circumflex accent (Û) - Ucirc */
    "\331",			/* capital U, grave accent (Ù) - Ugrave */
    "\334",			/* capital U, dieresis or umlaut mark (Ü) - Uuml */
    "\335",			/* capital Y, acute accent (Ý) - Yacute */
    "\341",			/* small a, acute accent (á) - aacute */
    "\342",			/* small a, circumflex accent (â) - acirc */
    "\264",			/* spacing acute (´) - acute */
    "\346",			/* small ae diphthong (ligature) (æ) - aelig */
    "\340",			/* small a, grave accent (à) - agrave */
    "\046",			/* ampersand (&) - amp */
    "\345",			/* small a, ring (å) - aring */
    "\343",			/* small a, tilde (ã) - atilde */
    "\344",			/* small a, dieresis or umlaut mark (ä) - auml */
    "\246",			/* broken vertical bar (¦) - brkbar */
    "\246",			/* broken vertical bar (¦) - brvbar */
    "\347",			/* small c, cedilla (ç) - ccedil */
    "\270",			/* spacing cedilla (¸) - cedil */
    "\242",			/* cent sign (¢) - cent */
    "\251",			/* copyright sign (©) - copy */
    "\244",			/* currency sign (¤) - curren */
    "\260",			/* degree sign (°) - deg */
    "\250",			/* spacing dieresis (¨) - die */
    "\367",			/* division sign (÷) - divide */
    "\351",			/* small e, acute accent (é) - eacute */
    "\352",			/* small e, circumflex accent (ê) - ecirc */
    "\350",			/* small e, grave accent (è) - egrave */
    "-",			/* dash the width of emsp - emdash */
    "\002",			/* emsp, em space - not collapsed NEVER CHANGE THIS - emsp */
    "-",			/* dash the width of ensp - endash */
    "\002",			/* ensp, en space - not collapsed NEVER CHANGE THIS - ensp */
    "\360",			/* small eth, Icelandic (ð) - eth */
    "\353",			/* small e, dieresis or umlaut mark (ë) - euml */
    "\275",			/* fraction 1/2 (½) - frac12 */
    "\274",			/* fraction 1/4 (¼) - frac14 */
    "\276",			/* fraction 3/4 (¾) - frac34 */
    "\076",			/* greater than (>) - gt */
    "\257",			/* spacing macron (¯) - hibar */
    "\355",			/* small i, acute accent (í) - iacute */
    "\356",			/* small i, circumflex accent (î) - icirc */
    "\241",			/* inverted exclamation mark (¡) - iexcl */
    "\354",			/* small i, grave accent (ì) - igrave */
    "\277",			/* inverted question mark (¿) - iquest */
    "\357",			/* small i, dieresis or umlaut mark (ï) - iuml */
    "\253",			/* angle quotation mark, left («) - laquo */
    "\074",			/* less than (<) - lt */
    "\257",			/* spacing macron (¯) - macr */
    "-",			/* dash the width of emsp - mdash */
    "\265",			/* micro sign (µ) - micro */
    "\267",			/* middle dot (·) - middot */
    "\001",			/* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
    "-",			/* dash the width of ensp - ndash */
    "\254",			/* negation sign (¬) - not */
    "\361",			/* small n, tilde (ñ) - ntilde */
    "\363",			/* small o, acute accent (ó) - oacute */
    "\364",			/* small o, circumflex accent (ô) - ocirc */
    "\362",			/* small o, grave accent (ò) - ograve */
    "\252",			/* feminine ordinal indicator (ª) - ordf */
    "\272",			/* masculine ordinal indicator (º) - ordm */
    "\370",			/* small o, slash (ø) - oslash */
    "\365",			/* small o, tilde (õ) - otilde */
    "\366",			/* small o, dieresis or umlaut mark (ö) - ouml */
    "\266",			/* paragraph sign (¶) - para */
    "\261",			/* plus-or-minus sign (±) - plusmn */
    "\243",			/* pound sign (£) - pound */
    "\042",			/* quote '"' (") - quot */
    "\273",			/* angle quotation mark, right (») - raquo */
    "\256",			/* circled R registered sign (®) - reg */
    "\247",			/* section sign (§) - sect */
    "\007",			/* soft hyphen (­) NEVER CHANGE THIS - shy */
    "\271",			/* superscript 1 (¹) - sup1 */
    "\262",			/* superscript 2 (²) - sup2 */
    "\263",			/* superscript 3 (³) - sup3 */
    "\337",			/* small sharp s, German (sz ligature) (ß) - szlig */
    "\002",			/* thin space - not collapsed NEVER CHANGE THIS - thinsp */
    "\376",			/* small thorn, Icelandic (þ) - thorn */
    "\327",			/* multiplication sign (×) - times */
    "(TM)",			/* circled TM trade mark sign (™) - trade */
    "\372",			/* small u, acute accent (ú) - uacute */
    "\373",			/* small u, circumflex accent (û) - ucirc */
    "\371",			/* small u, grave accent (ù) - ugrave */
    "\250",			/* spacing dieresis (¨) - uml */
    "\374",			/* small u, dieresis or umlaut mark (ü) - uuml */
    "\375",			/* small y, acute accent (ý) - yacute */
    "\245",			/* yen sign (¥) - yen */
    "\377",			/* small y, dieresis or umlaut mark (ÿ) - yuml */
};

/*	Entity values -- 7 bit character approximations
 *
 *	This MUST match exactly the table referred to in the DTD!
*/
/*
 * Positional table with one plain-ASCII replacement string per entity, in the
 * order of the entity names given at the end of each comment.  When
 * LY_UMLAUT is defined, the umlauted vowels are written with a trailing "e"
 * (Ae, Oe, Ue, ae, oe, ue) instead of the bare vowel.  The entries marked
 * NEVER CHANGE THIS are low control codes rather than printable text.
 */
const char *SevenBitApproximations[] =
{
    "AE",			/* capital AE diphthong (ligature) (Æ) - AElig */
    "A",			/* capital A, acute accent (Á) - Aacute */
    "A",			/* capital A, circumflex accent (Â) - Acirc */
    "A",			/* capital A, grave accent (À) - Agrave */
    "A",			/* capital A, ring - Aring (Å) */
    "A",			/* capital A, tilde - Atilde (Ã) */
#ifdef LY_UMLAUT
    "Ae",			/* capital A, dieresis or umlaut mark (Ä) - Auml */
#else
    "A",			/* capital A, dieresis or umlaut mark (Ä) - Auml */
#endif /* LY_UMLAUT */
    "C",			/* capital C, cedilla (Ç) - Ccedil */
    "Dj",			/* capital D with stroke (Ð) - Dstrok */
    "DH",			/* capital Eth, Icelandic (Ð) - ETH */
    "E",			/* capital E, acute accent (É) - Eacute */
    "E",			/* capital E, circumflex accent (Ê) - Ecirc */
    "E",			/* capital E, grave accent (È) - Egrave */
    "E",			/* capital E, dieresis or umlaut mark (Ë) - Euml */
    "I",			/* capital I, acute accent (Í) - Iacute */
    "I",			/* capital I, circumflex accent (Î) - Icirc */
    "I",			/* capital I, grave accent (Ì) - Igrave */
    "I",			/* capital I, dieresis or umlaut mark (Ï) - Iuml */
    "N",			/* capital N, tilde - Ntilde (Ñ) */
    "O",			/* capital O, acute accent (Ó) - Oacute */
    "O",			/* capital O, circumflex accent (Ô) - Ocirc */
    "O",			/* capital O, grave accent (Ò) - Ograve */
    "O",			/* capital O, slash (Ø) - Oslash */
    "O",			/* capital O, tilde (Õ) - Otilde */
#ifdef LY_UMLAUT
    "Oe",			/* capital O, dieresis or umlaut mark (Ö) - Ouml */
#else
    "O",			/* capital O, dieresis or umlaut mark (Ö) - Ouml */
#endif /* LY_UMLAUT */
    "P",			/* capital THORN, Icelandic (Þ) - THORN */
    "U",			/* capital U, acute accent (Ú) - Uacute */
    "U",			/* capital U, circumflex accent (Û) - Ucirc */
    "U",			/* capital U, grave accent (Ù) - Ugrave */
#ifdef LY_UMLAUT
    "Ue",			/* capital U, dieresis or umlaut mark (Ü) - Uuml */
#else
    "U",			/* capital U, dieresis or umlaut mark (Ü) - Uuml */
#endif /* LY_UMLAUT */
    "Y",			/* capital Y, acute accent (Ý) - Yacute */
    "a",			/* small a, acute accent (á) - aacute */
    "a",			/* small a, circumflex accent (â) - acirc */
    "'",			/* spacing acute (´) - acute */
    "ae",			/* small ae diphthong (ligature) (æ) - aelig */
    "`a",			/* small a, grave accent (à) - agrave */
    "&",			/* ampersand (&) - amp */
    "a",			/* small a, ring (å) - aring */
    "a",			/* small a, tilde (ã) - atilde */
#ifdef LY_UMLAUT
    "ae",			/* small a, dieresis or umlaut mark (ä) - auml */
#else
    "a",			/* small a, dieresis or umlaut mark (ä) - auml */
#endif /* LY_UMLAUT */
    "|",			/* broken vertical bar (¦) - brkbar */
    "|",			/* broken vertical bar (¦) - brvbar */
    "c",			/* small c, cedilla (ç) - ccedil */
    ",",			/* spacing cedilla (¸) - cedil */
    "-c-",			/* cent sign (¢) - cent */
    "(c)",			/* copyright sign (©) - copy */
    "CUR",			/* currency sign (¤) - curren */
    "DEG",			/* degree sign (°) - deg */
    "\042",			/* spacing dieresis (¨) - die */
    "/",			/* division sign (÷) - divide */
    "e",			/* small e, acute accent (é) - eacute */
    "e",			/* small e, circumflex accent (ê) - ecirc */
    "e",			/* small e, grave accent (è) - egrave */
    "-",			/* dash the width of emsp - emdash */
    "\002",			/* emsp NEVER CHANGE THIS - emsp */
    "-",			/* dash the width of ensp - endash */
    "\002",			/* ensp NEVER CHANGE THIS - ensp */
    "dh",			/* small eth, Icelandic eth (ð) */
    "e",			/* small e, dieresis or umlaut mark (ë) - euml */
    " 1/2",			/* fraction 1/2 (½) - frac12 */
    " 1/4",			/* fraction 1/4 (¼) - frac14 */
    " 3/4",			/* fraction 3/4 (¾) - frac34 */
    ">",			/* greater than (>) - gt */
    "-",			/* spacing macron (¯) - hibar */
    "i",			/* small i, acute accent (í) - iacute */
    "i",			/* small i, circumflex accent (î) - icirc */
    "!",			/* inverted exclamation mark (¡) - iexcl */
    "`i",			/* small i, grave accent (ì) - igrave */
    "?",			/* inverted question mark (¿) - iquest */
    "i",			/* small i, dieresis or umlaut mark (ï) - iuml */
    "<<",			/* angle quotation mark, left («) - laquo */
    "<",			/* less than - lt (<) */
    "-",			/* spacing macron (¯) - macr */
    "-",			/* dash the width of emsp - mdash */
    "u",			/* micro sign (µ) - micro */
    ".",			/* middle dot (·) - middot */
    "\001",			/* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
    "-",			/* dash the width of ensp - ndash */
    "NOT",			/* negation sign (¬) - not */
    "n",			/* small n, tilde (ñ) - ntilde */
    "o",			/* small o, acute accent (ó) - oacute */
    "o",			/* small o, circumflex accent (ô) - ocirc */
    "o",			/* small o, grave accent (ò) - ograve */
    "-a",			/* feminine ordinal indicator (ª) - ordf */
    "-o",			/* masculine ordinal indicator (º) - ordm */
    "o",			/* small o, slash (ø) - oslash */
    "o",			/* small o, tilde (õ) - otilde */
#ifdef LY_UMLAUT
    "oe",			/* small o, dieresis or umlaut mark (ö) - ouml */
#else
    "o",			/* small o, dieresis or umlaut mark (ö) - ouml */
#endif /* LY_UMLAUT */
    "P:",			/* paragraph sign (¶) - para */
    "+-",			/* plus-or-minus sign (±) - plusmn */
    "-L-",			/* pound sign (£) - pound */
    "\"",			/* quote '"' (") - quot */
    ">>",			/* angle quotation mark, right (») - raquo */
    "(R)",			/* circled R registered sign (®) - reg */
    "S:",			/* section sign (§) - sect */
    "\007",			/* soft hyphen (­) NEVER CHANGE THIS - shy */
    "^1",			/* superscript 1 (¹) - sup1 */
    "^2",			/* superscript 2 (²) - sup2 */
    "^3",			/* superscript 3 (³) - sup3 */
    "ss",			/* small sharp s, German (sz ligature) (ß) - szlig */
    "\002",			/* thin space - not collapsed NEVER CHANGE THIS - thinsp */
    "p",			/* small thorn, Icelandic (þ) - thorn */
    "*",			/* multiplication sign (×) - times */
    "(TM)",			/* circled TM trade mark sign (™) - trade */
    "u",			/* small u, acute accent (ú) - uacute */
    "u",			/* small u, circumflex accent (û) - ucirc */
    "u",			/* small u, grave accent (ù) - ugrave */
    "\042",			/* spacing dieresis (¨) - uml */
#ifdef LY_UMLAUT
    "ue",			/* small u, dieresis or umlaut mark (ü) - uuml */
#else
    "u",			/* small u, dieresis or umlaut mark (ü) - uuml */
#endif /* LY_UMLAUT */
    "y",			/* small y, acute accent (ý) - yacute */
    "YEN",			/* yen sign (¥) - yen */
    "y",			/* small y, dieresis or umlaut mark (ÿ) - yuml */
};

/*
 * Add your new character sets HERE (but only if you can't construct Unicode
 * tables for them).
- FM */

/*
 * Add the array name to LYCharSets
 *
 * Only the first two slots are initialized here; the remaining
 * MAXCHARSETS - 2 pointers start out NULL.
 */
const char **LYCharSets[MAXCHARSETS] =
{
    ISO_Latin1,			/* ISO Latin 1 */
    SevenBitApproximations,	/* 7 Bit Approximations */
};

/*
 * Add the name that the user will see below.  The order of LYCharSets and
 * LYchar_set_names MUST be the same
 *
 * The list is NULL-terminated (hence MAXCHARSETS + 1 slots).
 */
const char *LYchar_set_names[MAXCHARSETS + 1] =
{
    "Western (ISO-8859-1)",
    "7 bit approximations (US-ASCII)",
    (char *) 0
};

/*
 * Associate additional pieces of info with each of the charsets listed above.
 * Will be automatically modified (and extended) by charset translations which
 * are loaded using the chartrans mechanism.  Most important piece of info to
 * put here is a MIME charset name.  Used for chartrans (see UCDefs.h).  The
 * order of LYCharSets and LYCharSet_UC MUST be the same.
 *
 * Note that most of the charsets added by the new mechanism in src/chrtrans
 * don't show up here at all.  They don't have to.
 */
LYUCcharset LYCharSet_UC[MAXCHARSETS] =
{
  /*
   * Zero position placeholder and HTMLGetEntityUCValue() reference.  - FM
   */
    {-1, "iso-8859-1", UCT_ENC_8BIT, 0,
     UCT_REP_IS_LAT1,
     UCT_CP_IS_LAT1, UCT_R_LAT1, UCT_R_LAT1},

  /*
   * Placeholders for Unicode tables.  - FM
   */
    {-1, "us-ascii", UCT_ENC_7BIT, 0,
     UCT_REP_SUBSETOF_LAT1,
     UCT_CP_SUBSETOF_LAT1, UCT_R_ASCII, UCT_R_ASCII},
};

/*
 * Add the code of the lowest character with the high bit set that can be
 * directly displayed.  The order of LYCharSets and LYlowest_eightbit MUST be
 * the same.
 *
 * (If charset have chartrans unicode table, LYlowest_eightbit will be
 * verified/modified anyway.)
 */
int LYlowest_eightbit[MAXCHARSETS] =
{
    160,			/* ISO Latin 1 */
    999,			/* 7 bit approximations */
};

/*
 * Function to set the handling of selected character sets based on the current
 * LYUseDefaultRawMode value.
- FM */ void HTMLSetCharacterHandling(int i) { int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset); BOOLEAN LYRawMode_flag = LYRawMode; int UCLYhndl_for_unspec_flag = UCLYhndl_for_unspec; if (LYCharSet_UC[i].enc != UCT_ENC_CJK) { HTCJK = NOCJK; kanji_code = NOKANJI; if (i == chndl) LYRawMode = LYUseDefaultRawMode; else LYRawMode = (BOOL) (!LYUseDefaultRawMode); HTPassEightBitNum = (BOOL) ((LYCharSet_UC[i].codepoints & UCT_CP_SUPERSETOF_LAT1) || (LYCharSet_UC[i].like8859 & UCT_R_HIGH8BIT)); if (LYRawMode) { HTPassEightBitRaw = (BOOL) (LYlowest_eightbit[i] <= 160); } else { HTPassEightBitRaw = FALSE; } if (LYRawMode || i == chndl) { HTPassHighCtrlRaw = (BOOL) (LYlowest_eightbit[i] <= 130); } else { HTPassHighCtrlRaw = FALSE; } HTPassHighCtrlNum = FALSE; } else { /* CJK encoding: */ const char *mime = LYCharSet_UC[i].MIMEname; if (!strcmp(mime, "euc-cn")) { HTCJK = CHINESE; kanji_code = EUC; } else if (!strcmp(mime, "euc-jp")) { HTCJK = JAPANESE; kanji_code = EUC; } else if (!strcmp(mime, "shift_jis")) { HTCJK = JAPANESE; kanji_code = SJIS; } else if (!strcmp(mime, "euc-kr")) { HTCJK = KOREAN; kanji_code = EUC; } else if (!strcmp(mime, "big5")) { HTCJK = TAIPEI; kanji_code = EUC; } /* for any CJK: */ if (!LYUseDefaultRawMode) HTCJK = NOCJK; LYRawMode = (BOOL) (IS_CJK_TTY ? TRUE : FALSE); HTPassEightBitRaw = FALSE; HTPassEightBitNum = FALSE; HTPassHighCtrlRaw = (BOOL) (IS_CJK_TTY ? TRUE : FALSE); HTPassHighCtrlNum = FALSE; } /* * Comment for coding below: * UCLYhndl_for_unspec is "current" state with LYRawMode, but * UCAssume_MIMEcharset is independent from LYRawMode: holds the history * and may be changed from 'O'ptions menu only. - LP */ if (LYRawMode) { UCLYhndl_for_unspec = i; /* UCAssume_MIMEcharset not changed! */ } else { if (chndl != i && (LYCharSet_UC[i].enc != UCT_ENC_CJK || LYCharSet_UC[chndl].enc != UCT_ENC_CJK)) { UCLYhndl_for_unspec = chndl; /* fall to UCAssume_MIMEcharset */ } else { UCLYhndl_for_unspec = LATIN1; /* UCAssume_MIMEcharset not changed! 
*/ } } #ifdef USE_SLANG if (LYlowest_eightbit[i] > 191) { /* * Higher than this may output cntrl chars to screen. - KW */ SLsmg_Display_Eight_Bit = 191; } else { SLsmg_Display_Eight_Bit = LYlowest_eightbit[i]; } #endif /* USE_SLANG */ ena_csi(LYlowest_eightbit[current_char_set] > 155); /* some diagnostics */ if (TRACE) { if (LYRawMode_flag != LYRawMode) CTRACE((tfp, "HTMLSetCharacterHandling: LYRawMode changed %s -> %s\n", (LYRawMode_flag ? "ON" : "OFF"), (LYRawMode ? "ON" : "OFF"))); if (UCLYhndl_for_unspec_flag != UCLYhndl_for_unspec) CTRACE((tfp, "HTMLSetCharacterHandling: UCLYhndl_for_unspec changed %d -> %d\n", UCLYhndl_for_unspec_flag, UCLYhndl_for_unspec)); } return; } /* * Function to set HTCJK based on "in" and "out" charsets. */ void Set_HTCJK(const char *inMIMEname, const char *outMIMEname) { /* need not check for synonyms: MIMEname's got from LYCharSet_UC */ if (LYRawMode) { if ((!strcmp(inMIMEname, "euc-jp") || #ifdef EXP_JAPANESEUTF8_SUPPORT !strcmp(inMIMEname, "utf-8") || #endif !strcmp(inMIMEname, "shift_jis")) && (!strcmp(outMIMEname, "euc-jp") || !strcmp(outMIMEname, "shift_jis"))) { HTCJK = JAPANESE; } else if (!strcmp(inMIMEname, "euc-cn") && !strcmp(outMIMEname, "euc-cn")) { HTCJK = CHINESE; } else if (!strcmp(inMIMEname, "big5") && !strcmp(outMIMEname, "big5")) { HTCJK = TAIPEI; } else if (!strcmp(inMIMEname, "euc-kr") && !strcmp(outMIMEname, "euc-kr")) { HTCJK = KOREAN; } else { HTCJK = NOCJK; } } else { HTCJK = NOCJK; } } /* * Function to set the LYDefaultRawMode value based on the selected character * set. - FM * * Currently unused: the default value so obvious that LYUseDefaultRawMode * utilized directly by someone's mistake. - LP */ static void HTMLSetRawModeDefault(int i) { LYDefaultRawMode = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK); return; } /* * Function to set the LYUseDefaultRawMode value based on the selected * character set and the current LYRawMode value. 
- FM */ void HTMLSetUseDefaultRawMode(int i, int modeflag) { if (LYCharSet_UC[i].enc != UCT_ENC_CJK) { int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset); if (i == chndl) LYUseDefaultRawMode = (BOOLEAN) modeflag; else LYUseDefaultRawMode = (BOOL) (!modeflag); } else /* CJK encoding: */ LYUseDefaultRawMode = (BOOLEAN) modeflag; return; } /* * Function to set the LYHaveCJKCharacterSet value based on the selected * character set. - FM */ static void HTMLSetHaveCJKCharacterSet(int i) { LYHaveCJKCharacterSet = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK); return; } /* * Function to set the DisplayCharsetMatchLocale value based on the selected * character set. It is used in UPPER8 for 8bit case-insensitive search by * matching def7_uni.tbl images. - LP */ static void HTMLSetDisplayCharsetMatchLocale(int i) { BOOLEAN match; if (LYHaveCJKCharacterSet) { /* * We have no intention to pass CJK via UCTransChar if that happened. * Let someone from CJK correct this if necessary. */ DisplayCharsetMatchLocale = TRUE; /* old-style */ return; } else if (strncasecomp(LYCharSet_UC[i].MIMEname, "cp", 2) || strncasecomp(LYCharSet_UC[i].MIMEname, "windows", 7)) { /* * Assume dos/windows displays usually on remote terminal, hence it * rarely matches locale. (In fact, MS Windows codepoints locale are * never seen on UNIX). */ match = FALSE; } else { match = TRUE; /* guess, but see below */ #if !defined(LOCALE) if (LYCharSet_UC[i].enc != UCT_ENC_UTF8) /* * Leave true for utf-8 display - the code doesn't deal very well * with this case. - kw */ match = FALSE; #else if (UCForce8bitTOUPPER) { /* * Force disable locale (from lynx.cfg) */ match = FALSE; } #endif } DisplayCharsetMatchLocale = match; return; } /* * lynx 2.8/2.7.2(and more early) compatibility code: "human-readable" charset * names changes with time so we map that history names to MIME here to get old * lynx.cfg and (especially) .lynxrc always recognized. 
Please update this * table when you change "fullname" of any present charset. */ typedef struct _names_pairs { const char *fullname; const char *MIMEname; } names_pairs; /* *INDENT-OFF* */ static const names_pairs OLD_charset_names[] = { {"ISO Latin 1", "iso-8859-1"}, {"ISO Latin 2", "iso-8859-2"}, {"WinLatin1 (cp1252)", "windows-1252"}, {"DEC Multinational", "dec-mcs"}, {"Macintosh (8 bit)", "macintosh"}, {"NeXT character set", "next"}, {"KOI8-R Cyrillic", "koi8-r"}, {"Chinese", "euc-cn"}, {"Japanese (EUC)", "euc-jp"}, {"Japanese (SJIS)", "shift_jis"}, {"Korean", "euc-kr"}, {"Taipei (Big5)", "big5"}, {"Vietnamese (VISCII)", "viscii"}, {"7 bit approximations", "us-ascii"}, {"Transparent", "x-transparent"}, {"DosLatinUS (cp437)", "cp437"}, {"IBM PC character set", "cp437"}, {"DosLatin1 (cp850)", "cp850"}, {"IBM PC codepage 850", "cp850"}, {"DosLatin2 (cp852)", "cp852"}, {"PC Latin2 CP 852", "cp852"}, {"DosCyrillic (cp866)", "cp866"}, {"DosArabic (cp864)", "cp864"}, {"DosGreek (cp737)", "cp737"}, {"DosBaltRim (cp775)", "cp775"}, {"DosGreek2 (cp869)", "cp869"}, {"DosHebrew (cp862)", "cp862"}, {"WinLatin2 (cp1250)", "windows-1250"}, {"WinCyrillic (cp1251)", "windows-1251"}, {"WinGreek (cp1253)", "windows-1253"}, {"WinHebrew (cp1255)", "windows-1255"}, {"WinArabic (cp1256)", "windows-1256"}, {"WinBaltRim (cp1257)", "windows-1257"}, {"ISO Latin 3", "iso-8859-3"}, {"ISO Latin 4", "iso-8859-4"}, {"ISO 8859-5 Cyrillic", "iso-8859-5"}, {"ISO 8859-6 Arabic", "iso-8859-6"}, {"ISO 8859-7 Greek", "iso-8859-7"}, {"ISO 8859-8 Hebrew", "iso-8859-8"}, {"ISO-8859-8-I", "iso-8859-8"}, {"ISO-8859-8-E", "iso-8859-8"}, {"ISO 8859-9 (Latin 5)", "iso-8859-9"}, {"ISO 8859-10", "iso-8859-10"}, {"UNICODE UTF 8", "utf-8"}, {"RFC 1345 w/o Intro", "mnemonic+ascii+0"}, {"RFC 1345 Mnemonic", "mnemonic"}, {NULL, NULL}, /* terminated with NULL */ }; /* *INDENT-ON* */ /* * lynx 2.8/2.7.2 compatibility code: read "character_set" parameter from * lynx.cfg and .lynxrc in both MIME name and 
"human-readable" name (old and * new style). Returns -1 if not recognized. */ int UCGetLYhndl_byAnyName(char *value) { int i; LYTrimTrailing(value); if (value == NULL) return -1; CTRACE((tfp, "UCGetLYhndl_byAnyName(%s)\n", value)); /* search by name */ for (i = 0; (i < MAXCHARSETS && LYchar_set_names[i]); i++) { if (!strcmp(value, LYchar_set_names[i])) { return i; /* OK */ } } /* search by old name from 2.8/2.7.2 version */ for (i = 0; (OLD_charset_names[i].fullname); i++) { if (!strcmp(value, OLD_charset_names[i].fullname)) { return UCGetLYhndl_byMIME(OLD_charset_names[i].MIMEname); /* OK */ } } return UCGetLYhndl_byMIME(value); /* by MIME */ } /* * Entity names -- Ordered by ISO Latin 1 value. * --------------------------------------------- * For conversions of DECIMAL escaped entities. * Must be in order of ascending value. */ static const char *LYEntityNames[] = { /* NAME DECIMAL VALUE */ "nbsp", /* 160, non breaking space */ "iexcl", /* 161, inverted exclamation mark */ "cent", /* 162, cent sign */ "pound", /* 163, pound sign */ "curren", /* 164, currency sign */ "yen", /* 165, yen sign */ "brvbar", /* 166, broken vertical bar, (brkbar) */ "sect", /* 167, section sign */ "uml", /* 168, spacing dieresis */ "copy", /* 169, copyright sign */ "ordf", /* 170, feminine ordinal indicator */ "laquo", /* 171, angle quotation mark, left */ "not", /* 172, negation sign */ "shy", /* 173, soft hyphen */ "reg", /* 174, circled R registered sign */ "hibar", /* 175, spacing macron */ "deg", /* 176, degree sign */ "plusmn", /* 177, plus-or-minus sign */ "sup2", /* 178, superscript 2 */ "sup3", /* 179, superscript 3 */ "acute", /* 180, spacing acute (96) */ "micro", /* 181, micro sign */ "para", /* 182, paragraph sign */ "middot", /* 183, middle dot */ "cedil", /* 184, spacing cedilla */ "sup1", /* 185, superscript 1 */ "ordm", /* 186, masculine ordinal indicator */ "raquo", /* 187, angle quotation mark, right */ "frac14", /* 188, fraction 1/4 */ "frac12", /* 189, fraction 1/2 
*/ "frac34", /* 190, fraction 3/4 */ "iquest", /* 191, inverted question mark */ "Agrave", /* 192, capital A, grave accent */ "Aacute", /* 193, capital A, acute accent */ "Acirc", /* 194, capital A, circumflex accent */ "Atilde", /* 195, capital A, tilde */ "Auml", /* 196, capital A, dieresis or umlaut mark */ "Aring", /* 197, capital A, ring */ "AElig", /* 198, capital AE diphthong (ligature) */ "Ccedil", /* 199, capital C, cedilla */ "Egrave", /* 200, capital E, grave accent */ "Eacute", /* 201, capital E, acute accent */ "Ecirc", /* 202, capital E, circumflex accent */ "Euml", /* 203, capital E, dieresis or umlaut mark */ "Igrave", /* 204, capital I, grave accent */ "Iacute", /* 205, capital I, acute accent */ "Icirc", /* 206, capital I, circumflex accent */ "Iuml", /* 207, capital I, dieresis or umlaut mark */ "ETH", /* 208, capital Eth, Icelandic (or Latin2 Dstrok) */ "Ntilde", /* 209, capital N, tilde */ "Ograve", /* 210, capital O, grave accent */ "Oacute", /* 211, capital O, acute accent */ "Ocirc", /* 212, capital O, circumflex accent */ "Otilde", /* 213, capital O, tilde */ "Ouml", /* 214, capital O, dieresis or umlaut mark */ "times", /* 215, multiplication sign */ "Oslash", /* 216, capital O, slash */ "Ugrave", /* 217, capital U, grave accent */ "Uacute", /* 218, capital U, acute accent */ "Ucirc", /* 219, capital U, circumflex accent */ "Uuml", /* 220, capital U, dieresis or umlaut mark */ "Yacute", /* 221, capital Y, acute accent */ "THORN", /* 222, capital THORN, Icelandic */ "szlig", /* 223, small sharp s, German (sz ligature) */ "agrave", /* 224, small a, grave accent */ "aacute", /* 225, small a, acute accent */ "acirc", /* 226, small a, circumflex accent */ "atilde", /* 227, small a, tilde */ "auml", /* 228, small a, dieresis or umlaut mark */ "aring", /* 229, small a, ring */ "aelig", /* 230, small ae diphthong (ligature) */ "ccedil", /* 231, small c, cedilla */ "egrave", /* 232, small e, grave accent */ "eacute", /* 233, small e, acute accent 
*/ "ecirc", /* 234, small e, circumflex accent */ "euml", /* 235, small e, dieresis or umlaut mark */ "igrave", /* 236, small i, grave accent */ "iacute", /* 237, small i, acute accent */ "icirc", /* 238, small i, circumflex accent */ "iuml", /* 239, small i, dieresis or umlaut mark */ "eth", /* 240, small eth, Icelandic */ "ntilde", /* 241, small n, tilde */ "ograve", /* 242, small o, grave accent */ "oacute", /* 243, small o, acute accent */ "ocirc", /* 244, small o, circumflex accent */ "otilde", /* 245, small o, tilde */ "ouml", /* 246, small o, dieresis or umlaut mark */ "divide", /* 247, division sign */ "oslash", /* 248, small o, slash */ "ugrave", /* 249, small u, grave accent */ "uacute", /* 250, small u, acute accent */ "ucirc", /* 251, small u, circumflex accent */ "uuml", /* 252, small u, dieresis or umlaut mark */ "yacute", /* 253, small y, acute accent */ "thorn", /* 254, small thorn, Icelandic */ "yuml", /* 255, small y, dieresis or umlaut mark */ }; /* * Function to return the entity names of ISO-8859-1 8-bit characters. - FM */ const char *HTMLGetEntityName(UCode_t code) { #define IntValue code int MaxValue = (TABLESIZE(LYEntityNames) - 1); if (IntValue < 0 || IntValue > MaxValue) { return ""; } return LYEntityNames[IntValue]; } /* * Function to return the UCode_t (long int) value for entity names. It * returns 0 if not found. * * unicode_entities[] handles all the names from old style entities[] too. * Lynx now calls unicode_entities[] only through this function: * HTMLGetEntityUCValue(). Note, we need not check for special characters here * in function or even before it, we should check them *after* invoking this * function, see put_special_unicodes() in SGML.c. * * In the future we will try to isolate all calls to entities[] in favor of new * unicode-based chartrans scheme. 
- LP */ UCode_t HTMLGetEntityUCValue(const char *name) { #include UCode_t value = 0; size_t i, high, low; int diff = 0; size_t number_of_unicode_entities = TABLESIZE(unicode_entities); /* * Make sure we have a non-zero length name. - FM */ if (isEmpty(name)) return (value); /* * Try UC_entity_info unicode_entities[]. */ for (low = 0, high = number_of_unicode_entities; high > low; diff < 0 ? (low = i + 1) : (high = i)) { /* * Binary search. */ i = (low + (high - low) / 2); diff = AS_cmp(unicode_entities[i].name, name); /* Case sensitive! */ if (diff == 0) { value = unicode_entities[i].code; break; } } return (value); } /* * Original comment - * Assume these are Microsoft code points, inflicted on us by FrontPage. - FM * * MS FrontPage uses syntax like ™ in 128-159 range and doesn't follow * Unicode standards for this area. Windows-1252 codepoints are assumed here. * * However see - * http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#character-encodings-0 */ UCode_t LYcp1252ToUnicode(UCode_t code) { if ((code == 1) || (code > 127 && code < 160)) { switch (code) { case 1: /* * WHITE SMILING FACE */ code = 0x263a; break; case 128: /* * EURO currency sign */ code = 0x20ac; break; case 130: /* * SINGLE LOW-9 QUOTATION MARK (sbquo) */ code = 0x201a; break; case 131: /* * LATIN SMALL LETTER F WITH HOOK */ code = 0x192; break; case 132: /* * DOUBLE LOW-9 QUOTATION MARK (bdquo) */ code = 0x201e; break; case 133: /* * HORIZONTAL ELLIPSIS (hellip) */ code = 0x2026; break; case 134: /* * DAGGER (dagger) */ code = 0x2020; break; case 135: /* * DOUBLE DAGGER (Dagger) */ code = 0x2021; break; case 136: /* * MODIFIER LETTER CIRCUMFLEX ACCENT */ code = 0x2c6; break; case 137: /* * PER MILLE SIGN (permil) */ code = 0x2030; break; case 138: /* * LATIN CAPITAL LETTER S WITH CARON */ code = 0x160; break; case 139: /* * SINGLE LEFT-POINTING ANGLE QUOTATION MARK (lsaquo) */ code = 0x2039; break; case 140: /* * LATIN CAPITAL LIGATURE OE */ code = 0x152; break; 
case 142: /* * LATIN CAPITAL LETTER Z WITH CARON */ code = 0x17d; break; case 145: /* * LEFT SINGLE QUOTATION MARK (lsquo) */ code = 0x2018; break; case 146: /* * RIGHT SINGLE QUOTATION MARK (rsquo) */ code = 0x2019; break; case 147: /* * LEFT DOUBLE QUOTATION MARK (ldquo) */ code = 0x201c; break; case 148: /* * RIGHT DOUBLE QUOTATION MARK (rdquo) */ code = 0x201d; break; case 149: /* * BULLET (bull) */ code = 0x2022; break; case 150: /* * EN DASH (ndash) */ code = 0x2013; break; case 151: /* * EM DASH (mdash) */ code = 0x2014; break; case 152: /* * SMALL TILDE (tilde) */ code = 0x02dc; break; case 153: /* * TRADE MARK SIGN (trade) */ code = 0x2122; break; case 154: /* * LATIN SMALL LETTER S WITH CARON */ code = 0x161; break; case 155: /* * SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (rsaquo) */ code = 0x203a; break; case 156: /* * LATIN SMALL LIGATURE OE */ code = 0x153; break; case 158: /* * LATIN SMALL LETTER Z WITH CARON */ code = 0x17e; break; case 159: /* * LATIN CAPITAL LETTER Y WITH DIAERESIS */ code = 0x178; break; default: /* * Undefined (by convention, use the replacement character). */ code = 0xfffd; break; } } return code; } /* * Function to select a character set and then set the character handling and * LYHaveCJKCharacterSet flag. - FM */ void HTMLUseCharacterSet(int i) { HTMLSetRawModeDefault(i); p_entity_values = LYCharSets[i]; HTMLSetCharacterHandling(i); /* set LYRawMode and CJK attributes */ HTMLSetHaveCJKCharacterSet(i); HTMLSetDisplayCharsetMatchLocale(i); return; } /* * Initializer, calls initialization function for the CHARTRANS handling. 
- KW */
int LYCharSetsDeclared(void)
{
    UCInit();
    return UCInitialized;
}

#ifdef USE_CHARSET_CHOICE
/*
 * Make sure the charsets currently in use are not hidden and, for the old
 * options menu, rebuild the index maps and the NULL-terminated choice lists
 * from the hide flags in charset_subsets[].
 *
 * Reads UCLYhndl_for_unspec, current_char_set and LYNumCharsets.  Handles
 * outside 0..MAXCHARSETS-1 are ignored, so that a value of -1 (the initial
 * value of current_char_set) cannot index outside charset_subsets[].
 */
void init_charset_subsets(void)
{
#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
    int i, n;
    int cur_display = 0;
    int cur_assumed = 0;
#endif

    /* add them to displayed values */
    if (UCLYhndl_for_unspec >= 0 && UCLYhndl_for_unspec < MAXCHARSETS)
	charset_subsets[UCLYhndl_for_unspec].hide_assumed = FALSE;
    if (current_char_set >= 0 && current_char_set < MAXCHARSETS)
	charset_subsets[current_char_set].hide_display = FALSE;

#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
    /* all this stuff is for supporting old menu screen... */
    for (i = 0; i < LYNumCharsets && i < MAXCHARSETS; ++i) {
	if (charset_subsets[i].hide_display == FALSE) {
	    n = cur_display++;
	    if (i == current_char_set)
		displayed_display_charset_idx = n;
	    display_charset_map[n] = i;
	    display_charset_choices[n] = LYchar_set_names[i];
	}
	if (charset_subsets[i].hide_assumed == FALSE) {
	    n = cur_assumed++;
	    assumed_doc_charset_map[n] = i;
	    assumed_charset_choices[n] = LYCharSet_UC[i].MIMEname;
	    charset_subsets[i].assumed_idx = n;
	}
    }

    /*
     * Terminate both lists once, after the loop, so that they are terminated
     * even when the loop body never runs.  Each list has MAXCHARSETS + 1
     * slots, so the terminator always fits.
     */
    display_charset_choices[cur_display] = NULL;
    assumed_charset_choices[cur_assumed] = NULL;
#endif
}
#endif /* USE_CHARSET_CHOICE */