6 files changed, 109 insertions, 45 deletions
diff --git a/WWW/Library/Implementation/HTCJK.h b/WWW/Library/Implementation/HTCJK.h
index 06d816f5..11a43c52 100644
--- a/WWW/Library/Implementation/HTCJK.h
+++ b/WWW/Library/Implementation/HTCJK.h
@@ -1,5 +1,5 @@
 /*
- * $LynxId: HTCJK.h,v 1.19 2012/08/15 23:11:03 tom Exp $
+ * $LynxId: HTCJK.h,v 1.21 2021/06/30 17:16:36 tom Exp $
  *
  *			CJK character converter		HTCJK.h
  *			=======================
@@ -32,21 +32,29 @@ extern "C" {
 #define TO_HANJI	"\033$A"
 #define TO_HANGUL	"\033$(C"
 #define TO_ASCII	"\033(B"
-#define IS_SJIS_LO(lo)	((0x40<=lo)&&(lo!=0x7F)&&(lo<=0xFC))
+
+#define IS_GBK_LO(lo)	((0xA1 <= (lo)) && ((lo) <= 0xFE))	/* EUC-CN trail byte */
+#define IS_GBK_HI(hi)	((0xA1 <= (hi)) && ((hi) <= 0xF7))	/* EUC-CN lead byte */
+
+#define IS_SJIS_LO(lo)	((0x40 <= (lo)) && ((lo) != 0x7F) && ((lo) <= 0xFC))
 #define IS_SJIS_HI1(hi) ((0x81 <= (hi)) && ((hi) <= 0x9F))	/* 1st lev. */
 #define IS_SJIS_HI2(hi) ((0xE0 <= (hi)) && ((hi) <= 0xEF))	/* 2nd lev. */
-#define IS_SJIS(hi,lo,in_sjis) (!IS_SJIS_LO(lo)?0:IS_SJIS_HI1(hi)?(in_sjis=1):in_sjis&&IS_SJIS_HI2(hi))
-#define IS_SJIS_2BYTE(hi,lo) (IS_SJIS_LO(lo)&&(IS_SJIS_HI1(hi)||IS_SJIS_HI2(hi)))
-#define IS_SJIS_X0201KANA(lo) ((0xA1<=lo)&&(lo<=0xDF))
-#define IS_EUC_LOX(lo)	((0xA1<=lo)&&(lo<=0xFE))	/* extended */
-#define IS_EUC_HI(hi)	((0xA1<=hi)&&(hi<=0xFE))
-#define IS_EUC_X0201KANA(hi,lo) ((hi==0x8E)&&(0xA1<=lo)&&(lo<=0xDF))
-#define IS_EUC(hi,lo) ((IS_EUC_HI(hi) && IS_EUC_LOX(lo))||IS_EUC_X0201KANA(hi,lo))
+#define IS_SJIS(hi,lo,in_sjis) (!IS_SJIS_LO(lo) ? 0 : IS_SJIS_HI1(hi) ? (in_sjis=1) : in_sjis && IS_SJIS_HI2(hi))
+#define IS_SJIS_2BYTE(hi,lo)   (IS_SJIS_LO(lo) && (IS_SJIS_HI1(hi) || IS_SJIS_HI2(hi)))
+#define IS_SJIS_X0201KANA(lo)  ((0xA1 <= (lo)) && ((lo) <= 0xDF))
+
+#define IS_EUC_LOX(lo)	((0xA1 <= (lo)) && ((lo) <= 0xFE))	/* extended */
+#define IS_EUC_HI(hi)	((0xA1 <= (hi)) && ((hi) <= 0xFE))
+#define IS_EUC_X0201KANA(hi,lo) (((hi) == 0x8E) && (0xA1 <= (lo)) && ((lo) <= 0xDF))
+#define IS_EUC(hi,lo) ((IS_EUC_HI(hi) && IS_EUC_LOX(lo)) || IS_EUC_X0201KANA(hi,lo))
+
 #define IS_JAPANESE_2BYTE(hi,lo) (IS_SJIS_2BYTE(hi,lo) || IS_EUC(hi,lo))
-#define IS_BIG5_LOS(lo)	((0x40<=lo)&&(lo<=0x7E))	/* standard */
-#define IS_BIG5_LOX(lo)	((0xA1<=lo)&&(lo<=0xFE))	/* extended */
-#define IS_BIG5_HI(hi)	((0xA1<=hi)&&(hi<=0xFE))
-#define IS_BIG5(hi,lo) (IS_BIG5_HI(hi) && (IS_BIG5_LOS(lo) || IS_BIG5_LOX(lo)))
+
+#define IS_BIG5_LOS(lo)	((0x40 <= (lo)) && ((lo) <= 0x7E))	/* standard */
+#define IS_BIG5_LOX(lo)	((0xA1 <= (lo)) && ((lo) <= 0xFE))	/* extended */
+#define IS_BIG5_HI(hi)	((0xA1 <= (hi)) && ((hi) <= 0xFE))
+#define IS_BIG5(hi,lo)	(IS_BIG5_HI(hi) && (IS_BIG5_LOS(lo) || IS_BIG5_LOX(lo)))
+
     typedef enum {
 	NOKANJI = 0, EUC, SJIS, JIS
     } HTkcode;
diff --git a/WWW/Library/Implementation/HTMIME.c b/WWW/Library/Implementation/HTMIME.c
index d8fe9978..cce691c2 100644
--- a/WWW/Library/Implementation/HTMIME.c
+++ b/WWW/Library/Implementation/HTMIME.c
@@ -1,5 +1,5 @@
 /*
- * $LynxId: HTMIME.c,v 1.100 2018/03/11 21:32:38 tom Exp $
+ * $LynxId: HTMIME.c,v 1.101 2021/06/29 22:01:12 tom Exp $
  *
  *			MIME Message Parse			HTMIME.c
  *			==================
@@ -389,7 +389,7 @@ static int pumpData(HTStream *me)
 						UCT_SETBY_DEFAULT);
 		    }
 		    if ((p_in->enc != UCT_ENC_CJK)
-#ifdef EXP_JAPANESEUTF8_SUPPORT
+#ifdef USE_JAPANESEUTF8_SUPPORT
 			&& ((p_in->enc != UCT_ENC_UTF8)
 			    || (p_out->enc != UCT_ENC_CJK))
 #endif
diff --git a/WWW/Library/Implementation/HTUtils.h b/WWW/Library/Implementation/HTUtils.h
index 5aedc2f4..d01d0ddd 100644
--- a/WWW/Library/Implementation/HTUtils.h
+++ b/WWW/Library/Implementation/HTUtils.h
@@ -1,5 +1,5 @@
 /*
- * $LynxId: HTUtils.h,v 1.133 2021/06/09 22:17:19 tom Exp $
+ * $LynxId: HTUtils.h,v 1.134 2021/06/29 22:01:12 tom Exp $
  *
  * Utility macros for the W3 code library
  * MACROS FOR GENERAL USE
@@ -145,7 +145,7 @@ char *alloca();
 #endif
 
 #ifndef HAVE_ICONV
-#undef EXP_JAPANESEUTF8_SUPPORT
+#undef USE_JAPANESEUTF8_SUPPORT
 #endif
 
 #ifndef lynx_srand
diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c
index 7f107f44..7d1d44a8 100644
--- a/WWW/Library/Implementation/SGML.c
+++ b/WWW/Library/Implementation/SGML.c
@@ -1,5 +1,5 @@
 /*
- * $LynxId: SGML.c,v 1.169 2020/01/21 22:06:39 tom Exp $
+ * $LynxId: SGML.c,v 1.172 2021/06/30 20:25:01 tom Exp $
  *
  *			General SGML Parser code		SGML.c
  *			========================
@@ -38,6 +38,12 @@
 # include <LYPrettySrc.h>
 #endif
 
+/* a global variable doesn't work with info-stages which convert encoding */
+#if defined(EXP_CHINESEUTF8_SUPPORT)
+#undef IS_CJK_TTY
+#define IS_CJK_TTY (me->T.do_cjk)
+#endif
+
 #define AssumeCP1252(me) \
 	(((me)->inUCLYhndl == LATIN1 \
 	  || (me)->inUCLYhndl == US_ASCII) \
@@ -93,7 +99,7 @@ static void fake_put_character(HTStream *p GCC_UNUSED,
 /*the following macros are used for pretty source view. */
 #define IS_C(attr) (attr.type == HTMLA_CLASS)
 
-#if defined(EXP_JAPANESEUTF8_SUPPORT)
+#if defined(USE_JAPANESEUTF8_SUPPORT)
 # define UTF8_TTY_ISO2022JP (me->T.output_utf8)
 #else
 # define UTF8_TTY_ISO2022JP 0
@@ -402,7 +408,7 @@ static void set_chartrans_handling(HTStream *me,
      * would be better to call a Lynx_HTML_parser function to set an element in
      * its HTStructured object, itself, if this were needed.  - FM
      */
-#ifndef EXP_JAPANESEUTF8_SUPPORT
+#ifndef USE_JAPANESEUTF8_SUPPORT
     if (IS_CJK_TTY) {
 	me->current_tag_charset = -1;
     } else
@@ -1640,6 +1646,12 @@ static void SGML_character(HTStream *me, int c_in)
     c = UCH(c_in);
     clong = UCH(c);
 
+#if 0
+    CTRACE((tfp, "%s:%d PUTC %02x %c\n",
+	    LYCharSet_UC[me->inUCLYhndl].MIMEname, me->T.do_cjk, c, (c > 32 &&
+								     c < 127)
+	    ? c : '#'));
+#endif
     if (me->T.decode_utf8) {
 	switch (HTDecodeUTF8(&(me->U), &c_in, &clong)) {
 	case dUTF8_ok:
@@ -1665,7 +1677,7 @@ static void SGML_character(HTStream *me, int c_in)
     /*
      * If we want the raw input converted to Unicode, try that now.  - FM
      */
-#ifdef EXP_JAPANESEUTF8_SUPPORT
+#ifdef USE_JAPANESEUTF8_SUPPORT
     /* Convert ISO-2022-JP to Unicode (charset=iso-2022-jp is unrecognized) */
 #define IS_JIS7_HILO(c) (0x20<(c)&&(c)<0x7F)
     if (UTF8_TTY_ISO2022JP && (me->state == S_nonascii_text
@@ -1698,18 +1710,18 @@ static void SGML_character(HTStream *me, int c_in)
 	}
 	goto top1;
     }
-#endif /* EXP_JAPANESEUTF8_SUPPORT */
+#endif /* USE_JAPANESEUTF8_SUPPORT */
+#ifdef USE_JAPANESEUTF8_SUPPORT
     if (me->T.trans_to_uni &&
-#ifdef EXP_JAPANESEUTF8_SUPPORT
 	((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-jp") == 0) ||
 	 (strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "shift_jis") == 0))) {
 	if (strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "shift_jis") == 0) {
 	    if (me->U.utf_count == 0) {
-		if (IS_SJIS_HI1((unsigned char) c) ||
-		    IS_SJIS_HI2((unsigned char) c)) {
+		if (IS_SJIS_HI1(c) ||
+		    IS_SJIS_HI2(c)) {
 		    me->U.utf_buf[0] = (char) c;
 		    me->U.utf_count = 1;
-		    clong = -11;
+		    clong = ucCannotConvert;
 		} else if (IS_SJIS_X0201KANA(c)) {
 		    if (conv_jisx0201kana) {
 			JISx0201TO0208_SJIS(c,
@@ -1721,7 +1733,7 @@ static void SGML_character(HTStream *me, int c_in)
 		    }
 		}
 	    } else {
-		if (IS_SJIS_LO((unsigned char) c)) {
+		if (IS_SJIS_LO(c)) {
 		    me->U.utf_buf[1] = (char) c;
 		    clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
 		}
@@ -1729,13 +1741,13 @@ static void SGML_character(HTStream *me, int c_in)
 	    }
 	} else {
 	    if (me->U.utf_count == 0) {
-		if (IS_EUC_HI((unsigned char) c) || c == 0x8E) {
+		if (IS_EUC_HI(c) || c == 0x8E) {
 		    me->U.utf_buf[0] = (char) c;
 		    me->U.utf_count = 1;
-		    clong = -11;
+		    clong = ucCannotConvert;
 		}
 	    } else {
-		if (IS_EUC_LOX((unsigned char) c)) {
+		if (IS_EUC_LOX(c)) {
 		    me->U.utf_buf[1] = (char) c;
 		    clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
 		}
@@ -1743,12 +1755,45 @@ static void SGML_character(HTStream *me, int c_in)
 	    }
 	}
 	goto top1;
-    } else if (me->T.trans_to_uni &&
-#endif /* EXP_JAPANESEUTF8_SUPPORT */
+    } else
+#endif /* USE_JAPANESEUTF8_SUPPORT */
+#ifdef EXP_CHINESEUTF8_SUPPORT
+	if (me->T.trans_to_uni &&
+	    ((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-cn") == 0))) {
+	if (me->U.utf_count == 0) {
+	    /* EUC-CN lead byte: hold it until the trail byte arrives */
+	    if (IS_GBK_HI(c)) {
+		me->U.utf_buf[0] = (char) c;
+		me->U.utf_count = 1;
+		clong = ucCannotConvert;
+		CTRACE((tfp, "Get EUC-CN: 0x%02X\n", c & 0xff));
+	    }
+	} else {
+	    if (IS_GBK_LO(c)) {
+		me->U.utf_buf[1] = (char) c;
+		clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
+		if (clong > 0) {
+		    CTRACE((tfp, "... second: [%02X%02X] U+%04lX\n",
+			    me->U.utf_buf[0] & 0xff,
+			    me->U.utf_buf[1] & 0xff,
+			    clong));
+		} else {
+		    CTRACE((tfp, "... second: [%02X%02X] %ld\n",
+			    me->U.utf_buf[0] & 0xff,
+			    me->U.utf_buf[1] & 0xff,
+			    clong));
+		}
+	    }
+	    me->U.utf_count = 0;
+	}
+	goto top1;
+    } else
+#endif /* EXP_CHINESEUTF8_SUPPORT */
+	if (me->T.trans_to_uni &&
 	/* S/390 -- gil -- 0744 */
-	       ((TOASCII(clong) >= LYlowest_eightbit[me->inUCLYhndl]) ||
-		(clong < ' ' && clong != 0 &&
-		 me->T.trans_C0_to_uni))) {
+	    ((TOASCII(clong) >= LYlowest_eightbit[me->inUCLYhndl]) ||
+	     (clong < ' ' && clong != 0 &&
+	      me->T.trans_C0_to_uni))) {
 	/*
 	 * Convert the octet to Unicode.  - FM
 	 */
@@ -1890,7 +1935,7 @@ static void SGML_character(HTStream *me, int c_in)
      */
     if ((HTCJK == JAPANESE) && (me->state == S_in_kanji) &&
 	!IS_JAPANESE_2BYTE(me->kanji_buf, UCH(c))
-#ifdef EXP_JAPANESEUTF8_SUPPORT
+#ifdef USE_JAPANESEUTF8_SUPPORT
 	&& !me->T.decode_utf8
 #endif
 	) {
@@ -1944,9 +1989,22 @@ static void SGML_character(HTStream *me, int c_in)
 	}
 	/* FALLTHRU */
     case S_text:
-	if (IS_CJK_TTY && ((TOASCII(c) & 0200) != 0)
-#ifdef EXP_JAPANESEUTF8_SUPPORT
-	    && !me->T.decode_utf8
+#ifdef EXP_CHINESEUTF8_SUPPORT
+	if (IS_CJK_TTY &&
+	    !strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-cn")) {
+	    /*
+	     * Leave the case statement if we have not collected both of the
+	     * bytes for the EUC-CN character.  If we have, then continue on
+	     * to convert it to Unicode.
+	     */
+	    if (clong == ucCannotConvert) {
+		break;
+	    }
+	} else
+#endif
+	    if (IS_CJK_TTY && ((TOASCII(c) & 0200) != 0)
+#ifdef USE_JAPANESEUTF8_SUPPORT
+		&& !me->T.decode_utf8
 #endif
 	    ) {			/* S/390 -- gil -- 0864 */
 	    /*
@@ -2461,8 +2519,6 @@ static void SGML_character(HTStream *me, int c_in)
 #ifdef USE_PRETTYSRC
 	    entity_string = string->data;
 #endif
-	    /* S/390 -- gil -- 1039 */
-	    /* CTRACE((tfp, "%s: %d: %s\n", __FILE__, __LINE__, string->data)); */
 	    if (!strcmp(string->data, "zwnj") &&
 		(!me->element_stack ||
 		 (me->element_stack->tag &&
diff --git a/WWW/Library/Implementation/UCDefs.h b/WWW/Library/Implementation/UCDefs.h
index 7555beaa..4eb7c566 100644
--- a/WWW/Library/Implementation/UCDefs.h
+++ b/WWW/Library/Implementation/UCDefs.h
@@ -1,5 +1,5 @@
 /*
- * $LynxId: UCDefs.h,v 1.17 2009/03/10 20:02:44 tom Exp $
+ * $LynxId: UCDefs.h,v 1.18 2021/06/29 00:21:51 tom Exp $
  *
  * Definitions for Unicode character-translations
  */
@@ -46,7 +46,7 @@ typedef enum {
 #define UCT_REP_SUPERSETOF_LAT1 0x02
 #define UCT_REP_IS_LAT1 UCT_REP_SUBSETOF_LAT1 | UCT_REP_SUPERSETOF_LAT1
 /*
- *  Assume everything we deal with is included in the UCS2 reperoire,
+ *  Assume everything we deal with is included in the UCS2 repertoire,
  *  so a flag for _REP_SUBSETOF_UCS2 would be redundant.
  */
 
diff --git a/WWW/Library/Implementation/UCMap.h b/WWW/Library/Implementation/UCMap.h
index 9018aa1b..d9feb44c 100644
--- a/WWW/Library/Implementation/UCMap.h
+++ b/WWW/Library/Implementation/UCMap.h
@@ -1,5 +1,5 @@
 /*
- * $LynxId: UCMap.h,v 1.28 2014/12/07 14:40:40 tom Exp $
+ * $LynxId: UCMap.h,v 1.29 2021/06/29 22:01:12 tom Exp $
  */
 #ifndef UCMAP_H
 #define UCMAP_H
@@ -50,7 +50,7 @@ extern "C" {
 			      int charset_in,
 			      int charset_out,
 			      int chk_single_flag);
-#ifdef EXP_JAPANESEUTF8_SUPPORT
+#ifdef USE_JAPANESEUTF8_SUPPORT
     extern UCode_t UCTransJPToUni(char *inbuf,
 				  int buflen,
 				  int charset_in);