diff options
Diffstat (limited to 'WWW')
-rw-r--r-- | WWW/Library/Implementation/HTCJK.h | 34 | ||||
-rw-r--r-- | WWW/Library/Implementation/HTMIME.c | 4 | ||||
-rw-r--r-- | WWW/Library/Implementation/HTUtils.h | 4 | ||||
-rw-r--r-- | WWW/Library/Implementation/SGML.c | 104 | ||||
-rw-r--r-- | WWW/Library/Implementation/UCDefs.h | 4 | ||||
-rw-r--r-- | WWW/Library/Implementation/UCMap.h | 4 |
6 files changed, 109 insertions, 45 deletions
diff --git a/WWW/Library/Implementation/HTCJK.h b/WWW/Library/Implementation/HTCJK.h index 06d816f5..11a43c52 100644 --- a/WWW/Library/Implementation/HTCJK.h +++ b/WWW/Library/Implementation/HTCJK.h @@ -1,5 +1,5 @@ /* - * $LynxId: HTCJK.h,v 1.19 2012/08/15 23:11:03 tom Exp $ + * $LynxId: HTCJK.h,v 1.21 2021/06/30 17:16:36 tom Exp $ * * CJK character converter HTCJK.h * ======================= @@ -32,21 +32,29 @@ extern "C" { #define TO_HANJI "\033$A" #define TO_HANGUL "\033$(C" #define TO_ASCII "\033(B" -#define IS_SJIS_LO(lo) ((0x40<=lo)&&(lo!=0x7F)&&(lo<=0xFC)) + +#define IS_GBK_LO(lo) ((0xA1 <= (lo)) && ((lo) <= 0xFE)) +#define IS_GBK_HI(hi) ((0xA1 <= (hi)) && ((hi) <= 0xF7)) + +#define IS_SJIS_LO(lo) ((0x40 <= (lo)) && ((lo) != 0x7F) && ((lo) <= 0xFC)) #define IS_SJIS_HI1(hi) ((0x81 <= (hi)) && ((hi) <= 0x9F)) /* 1st lev. */ #define IS_SJIS_HI2(hi) ((0xE0 <= (hi)) && ((hi) <= 0xEF)) /* 2nd lev. */ -#define IS_SJIS(hi,lo,in_sjis) (!IS_SJIS_LO(lo)?0:IS_SJIS_HI1(hi)?(in_sjis=1):in_sjis&&IS_SJIS_HI2(hi)) -#define IS_SJIS_2BYTE(hi,lo) (IS_SJIS_LO(lo)&&(IS_SJIS_HI1(hi)||IS_SJIS_HI2(hi))) -#define IS_SJIS_X0201KANA(lo) ((0xA1<=lo)&&(lo<=0xDF)) -#define IS_EUC_LOX(lo) ((0xA1<=lo)&&(lo<=0xFE)) /* extended */ -#define IS_EUC_HI(hi) ((0xA1<=hi)&&(hi<=0xFE)) -#define IS_EUC_X0201KANA(hi,lo) ((hi==0x8E)&&(0xA1<=lo)&&(lo<=0xDF)) -#define IS_EUC(hi,lo) ((IS_EUC_HI(hi) && IS_EUC_LOX(lo))||IS_EUC_X0201KANA(hi,lo)) +#define IS_SJIS(hi,lo,in_sjis) (!IS_SJIS_LO(lo) ? 0 : IS_SJIS_HI1(hi) ? (in_sjis=1) : in_sjis && IS_SJIS_HI2(hi)) +#define IS_SJIS_2BYTE(hi,lo) (IS_SJIS_LO(lo) && (IS_SJIS_HI1(hi) || IS_SJIS_HI2(hi))) +#define IS_SJIS_X0201KANA(lo) ((0xA1 <= (lo)) && ((lo) <= 0xDF)) + +#define IS_EUC_LOX(lo) ((0xA1 <= (lo)) && ((lo) <= 0xFE)) /* extended */ +#define IS_EUC_HI(hi) ((0xA1 <= (hi)) && ((hi) <= 0xFE)) +#define IS_EUC_X0201KANA(hi,lo) (((hi) == 0x8E) && (0xA1 <= (lo)) && ((lo) <= 0xDF)) +#define IS_EUC(hi,lo) ((IS_EUC_HI(hi) && IS_EUC_LOX(lo)) || IS_EUC_X0201KANA(hi,lo)) + #define IS_JAPANESE_2BYTE(hi,lo) (IS_SJIS_2BYTE(hi,lo) || IS_EUC(hi,lo)) -#define IS_BIG5_LOS(lo) ((0x40<=lo)&&(lo<=0x7E)) /* standard */ -#define IS_BIG5_LOX(lo) ((0xA1<=lo)&&(lo<=0xFE)) /* extended */ -#define IS_BIG5_HI(hi) ((0xA1<=hi)&&(hi<=0xFE)) -#define IS_BIG5(hi,lo) (IS_BIG5_HI(hi) && (IS_BIG5_LOS(lo) || IS_BIG5_LOX(lo))) + +#define IS_BIG5_LOS(lo) ((0x40 <= (lo)) && ((lo) <= 0x7E)) /* standard */ +#define IS_BIG5_LOX(lo) ((0xA1 <= (lo)) && ((lo) <= 0xFE)) /* extended */ +#define IS_BIG5_HI(hi) ((0xA1 <= (hi)) && ((hi) <= 0xFE)) +#define IS_BIG5(hi,lo) (IS_BIG5_HI(hi) && (IS_BIG5_LOS(lo) || IS_BIG5_LOX(lo))) + typedef enum { NOKANJI = 0, EUC, SJIS, JIS } HTkcode; diff --git a/WWW/Library/Implementation/HTMIME.c b/WWW/Library/Implementation/HTMIME.c index d8fe9978..cce691c2 100644 --- a/WWW/Library/Implementation/HTMIME.c +++ b/WWW/Library/Implementation/HTMIME.c @@ -1,5 +1,5 @@ /* - * $LynxId: HTMIME.c,v 1.100 2018/03/11 21:32:38 tom Exp $ + * $LynxId: HTMIME.c,v 1.101 2021/06/29 22:01:12 tom Exp $ * * MIME Message Parse HTMIME.c * ================== @@ -389,7 +389,7 @@ static int pumpData(HTStream *me) UCT_SETBY_DEFAULT); } if ((p_in->enc != UCT_ENC_CJK) -#ifdef EXP_JAPANESEUTF8_SUPPORT +#ifdef USE_JAPANESEUTF8_SUPPORT && ((p_in->enc != UCT_ENC_UTF8) || (p_out->enc != UCT_ENC_CJK)) #endif diff --git a/WWW/Library/Implementation/HTUtils.h b/WWW/Library/Implementation/HTUtils.h index 5aedc2f4..d01d0ddd 100644 --- a/WWW/Library/Implementation/HTUtils.h +++ b/WWW/Library/Implementation/HTUtils.h @@ -1,5 +1,5 @@ /* - * $LynxId: HTUtils.h,v 1.133 2021/06/09 22:17:19 tom Exp $ + * $LynxId: HTUtils.h,v 1.134 2021/06/29 22:01:12 tom Exp $ * * Utility macros for the W3 code library * MACROS FOR GENERAL USE @@ -145,7 +145,7 @@ char *alloca(); #endif #ifndef HAVE_ICONV -#undef EXP_JAPANESEUTF8_SUPPORT +#undef USE_JAPANESEUTF8_SUPPORT #endif #ifndef lynx_srand diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c index 7f107f44..7d1d44a8 100644 --- a/WWW/Library/Implementation/SGML.c +++ b/WWW/Library/Implementation/SGML.c @@ -1,5 +1,5 @@ /* - * $LynxId: SGML.c,v 1.169 2020/01/21 22:06:39 tom Exp $ + * $LynxId: SGML.c,v 1.172 2021/06/30 20:25:01 tom Exp $ * * General SGML Parser code SGML.c * ======================== @@ -38,6 +38,12 @@ # include <LYPrettySrc.h> #endif +/* a global variable doesn't work with info-stages which convert encoding */ +#if defined(EXP_CHINESEUTF8_SUPPORT) +#undef IS_CJK_TTY +#define IS_CJK_TTY me->T.do_cjk +#endif + #define AssumeCP1252(me) \ (((me)->inUCLYhndl == LATIN1 \ || (me)->inUCLYhndl == US_ASCII) \ @@ -93,7 +99,7 @@ static void fake_put_character(HTStream *p GCC_UNUSED, /*the following macros are used for pretty source view. */ #define IS_C(attr) (attr.type == HTMLA_CLASS) -#if defined(EXP_JAPANESEUTF8_SUPPORT) +#if defined(USE_JAPANESEUTF8_SUPPORT) # define UTF8_TTY_ISO2022JP (me->T.output_utf8) #else # define UTF8_TTY_ISO2022JP 0 @@ -402,7 +408,7 @@ static void set_chartrans_handling(HTStream *me, * would be better to call a Lynx_HTML_parser function to set an element in * its HTStructured object, itself, if this were needed. - FM */ -#ifndef EXP_JAPANESEUTF8_SUPPORT +#ifndef USE_JAPANESEUTF8_SUPPORT if (IS_CJK_TTY) { me->current_tag_charset = -1; } else @@ -1640,6 +1646,12 @@ static void SGML_character(HTStream *me, int c_in) c = UCH(c_in); clong = UCH(c); +#if 0 + CTRACE((tfp, "%s:%d PUTC %02x %c\n", + LYCharSet_UC[me->inUCLYhndl].MIMEname, me->T.do_cjk, c, (c > 32 && + c < 127) + ? c : '#')); +#endif if (me->T.decode_utf8) { switch (HTDecodeUTF8(&(me->U), &c_in, &clong)) { case dUTF8_ok: @@ -1665,7 +1677,7 @@ static void SGML_character(HTStream *me, int c_in) /* * If we want the raw input converted to Unicode, try that now. - FM */ -#ifdef EXP_JAPANESEUTF8_SUPPORT +#ifdef USE_JAPANESEUTF8_SUPPORT /* Convert ISO-2022-JP to Unicode (charset=iso-2022-jp is unrecognized) */ #define IS_JIS7_HILO(c) (0x20<(c)&&(c)<0x7F) if (UTF8_TTY_ISO2022JP && (me->state == S_nonascii_text @@ -1698,18 +1710,18 @@ static void SGML_character(HTStream *me, int c_in) } goto top1; } -#endif /* EXP_JAPANESEUTF8_SUPPORT */ +#endif /* USE_JAPANESEUTF8_SUPPORT */ +#ifdef USE_JAPANESEUTF8_SUPPORT if (me->T.trans_to_uni && -#ifdef EXP_JAPANESEUTF8_SUPPORT ((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-jp") == 0) || (strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "shift_jis") == 0))) { if (strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "shift_jis") == 0) { if (me->U.utf_count == 0) { - if (IS_SJIS_HI1((unsigned char) c) || - IS_SJIS_HI2((unsigned char) c)) { + if (IS_SJIS_HI1(c) || + IS_SJIS_HI2(c)) { me->U.utf_buf[0] = (char) c; me->U.utf_count = 1; - clong = -11; + clong = ucCannotConvert; } else if (IS_SJIS_X0201KANA(c)) { if (conv_jisx0201kana) { JISx0201TO0208_SJIS(c, @@ -1721,7 +1733,7 @@ static void SGML_character(HTStream *me, int c_in) } } } else { - if (IS_SJIS_LO((unsigned char) c)) { + if (IS_SJIS_LO(c)) { me->U.utf_buf[1] = (char) c; clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl); } @@ -1729,13 +1741,13 @@ static void SGML_character(HTStream *me, int c_in) } } else { if (me->U.utf_count == 0) { - if (IS_EUC_HI((unsigned char) c) || c == 0x8E) { + if (IS_EUC_HI(c) || c == 0x8E) { me->U.utf_buf[0] = (char) c; me->U.utf_count = 1; - clong = -11; + clong = ucCannotConvert; } } else { - if (IS_EUC_LOX((unsigned char) c)) { + if (IS_EUC_LOX(c)) { me->U.utf_buf[1] = (char) c; clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl); } @@ -1743,12 +1755,45 @@ static void SGML_character(HTStream *me, int c_in) } } goto top1; - } else if (me->T.trans_to_uni && -#endif /* EXP_JAPANESEUTF8_SUPPORT */ + } else +#endif /* USE_JAPANESEUTF8_SUPPORT */ +#ifdef EXP_CHINESEUTF8_SUPPORT + if (me->T.trans_to_uni && + ((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-cn") == 0))) { + if (me->U.utf_count == 0) { + if (IS_GBK_HI(c) || + IS_GBK_HI(c)) { + me->U.utf_buf[0] = (char) c; + me->U.utf_count = 1; + clong = ucCannotConvert; + CTRACE((tfp, "Get EUC-CN: 0x%02X\n", c & 0xff)); + } + } else { + if (IS_GBK_LO(c)) { + me->U.utf_buf[1] = (char) c; + clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl); + if (clong > 0) { + CTRACE((tfp, "... second: [%02X%02X] U+%04lX\n", + me->U.utf_buf[0] & 0xff, + me->U.utf_buf[1] & 0xff, + clong)); + } else { + CTRACE((tfp, "... second: [%02X%02X] %ld\n", + me->U.utf_buf[0] & 0xff, + me->U.utf_buf[1] & 0xff, + clong)); + } + } + me->U.utf_count = 0; + } + goto top1; + } else +#endif /* EXP_CHINESEUTF8_SUPPORT */ + if (me->T.trans_to_uni && /* S/390 -- gil -- 0744 */ - ((TOASCII(clong) >= LYlowest_eightbit[me->inUCLYhndl]) || - (clong < ' ' && clong != 0 && - me->T.trans_C0_to_uni))) { + ((TOASCII(clong) >= LYlowest_eightbit[me->inUCLYhndl]) || + (clong < ' ' && clong != 0 && + me->T.trans_C0_to_uni))) { /* * Convert the octet to Unicode. - FM */ @@ -1890,7 +1935,7 @@ static void SGML_character(HTStream *me, int c_in) */ if ((HTCJK == JAPANESE) && (me->state == S_in_kanji) && !IS_JAPANESE_2BYTE(me->kanji_buf, UCH(c)) -#ifdef EXP_JAPANESEUTF8_SUPPORT +#ifdef USE_JAPANESEUTF8_SUPPORT && !me->T.decode_utf8 #endif ) { @@ -1944,9 +1989,22 @@ static void SGML_character(HTStream *me, int c_in) } /* FALLTHRU */ case S_text: - if (IS_CJK_TTY && ((TOASCII(c) & 0200) != 0) -#ifdef EXP_JAPANESEUTF8_SUPPORT - && !me->T.decode_utf8 +#ifdef EXP_CHINESEUTF8_SUPPORT + if (IS_CJK_TTY && + !strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-cn")) { + /* + * Leave the case statement if we have not collected both of the + * bytes for the EUC-CN character. If we have, then continue on + * to convert it to Unicode. + */ + if (clong == ucCannotConvert) { + break; + } + } else +#endif + if (IS_CJK_TTY && ((TOASCII(c) & 0200) != 0) +#ifdef USE_JAPANESEUTF8_SUPPORT + && !me->T.decode_utf8 #endif ) { /* S/390 -- gil -- 0864 */ /* @@ -2461,8 +2519,6 @@ static void SGML_character(HTStream *me, int c_in) #ifdef USE_PRETTYSRC entity_string = string->data; #endif - /* S/390 -- gil -- 1039 */ - /* CTRACE((tfp, "%s: %d: %s\n", __FILE__, __LINE__, string->data)); */ if (!strcmp(string->data, "zwnj") && (!me->element_stack || (me->element_stack->tag && diff --git a/WWW/Library/Implementation/UCDefs.h b/WWW/Library/Implementation/UCDefs.h index 7555beaa..4eb7c566 100644 --- a/WWW/Library/Implementation/UCDefs.h +++ b/WWW/Library/Implementation/UCDefs.h @@ -1,5 +1,5 @@ /* - * $LynxId: UCDefs.h,v 1.17 2009/03/10 20:02:44 tom Exp $ + * $LynxId: UCDefs.h,v 1.18 2021/06/29 00:21:51 tom Exp $ * * Definitions for Unicode character-translations */ @@ -46,7 +46,7 @@ typedef enum { #define UCT_REP_SUPERSETOF_LAT1 0x02 #define UCT_REP_IS_LAT1 UCT_REP_SUBSETOF_LAT1 | UCT_REP_SUPERSETOF_LAT1 /* - * Assume everything we deal with is included in the UCS2 reperoire, + * Assume everything we deal with is included in the UCS2 repertoire, * so a flag for _REP_SUBSETOF_UCS2 would be redundant. */ diff --git a/WWW/Library/Implementation/UCMap.h b/WWW/Library/Implementation/UCMap.h index 9018aa1b..d9feb44c 100644 --- a/WWW/Library/Implementation/UCMap.h +++ b/WWW/Library/Implementation/UCMap.h @@ -1,5 +1,5 @@ /* - * $LynxId: UCMap.h,v 1.28 2014/12/07 14:40:40 tom Exp $ + * $LynxId: UCMap.h,v 1.29 2021/06/29 22:01:12 tom Exp $ */ #ifndef UCMAP_H #define UCMAP_H @@ -50,7 +50,7 @@ extern "C" { int charset_in, int charset_out, int chk_single_flag); -#ifdef EXP_JAPANESEUTF8_SUPPORT +#ifdef USE_JAPANESEUTF8_SUPPORT extern UCode_t UCTransJPToUni(char *inbuf, int buflen, int charset_in); |