about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- CHANGES 7
-rw-r--r-- WWW/Library/Implementation/SGML.c 166
-rw-r--r-- src/UCdomap.c 14
3 files changed, 158 insertions, 29 deletions
diff --git a/CHANGES b/CHANGES
index 67e8ddce..da7678f0 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,9 +1,14 @@
--- $LynxId: CHANGES,v 1.1002 2018/12/28 22:03:14 tom Exp $
+-- $LynxId: CHANGES,v 1.1003 2018/12/29 00:36:37 tom Exp $
 ===============================================================================
 Changes since Lynx 2.8 release
 ===============================================================================
 
 2018-12-28 (2.9.0dev.1)
+* add to experimental Japanese UTF-8 feature -KH
+  + fix JIS X 0201 Katakana conversion from Shift_JIS/EUC-JP to UTF-8.
+    http://www1.interq.or.jp/~deton/lynx-sjisjisx0201/
+  + add conversion from Japanese ISO-2022-JP html to UTF-8 display_charset.
+    http://www1.interq.or.jp/~deton/lynx-jis2utf8/
 * add configure option for experimental feature "--enable-wcwidth-support" -TD
 * use/adapt wcwidth from xterm -TD
 * add support for displaying double-cell characters -KH
diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c
index 193e68dd..f9c2c8bc 100644
--- a/WWW/Library/Implementation/SGML.c
+++ b/WWW/Library/Implementation/SGML.c
@@ -1,5 +1,5 @@
 /*
- * $LynxId: SGML.c,v 1.163 2018/05/15 21:55:21 tom Exp $
+ * $LynxId: SGML.c,v 1.165 2018/12/29 00:50:51 tom Exp $
  *
  *			General SGML Parser code		SGML.c
  *			========================
@@ -93,6 +93,12 @@ static void fake_put_character(HTStream *p GCC_UNUSED,
 /*the following macros are used for pretty source view. */
 #define IS_C(attr) (attr.type == HTMLA_CLASS)
 
+#if defined(EXP_JAPANESEUTF8_SUPPORT)
+# define UTF8_TTY_ISO2022JP (me->T.output_utf8)
+#else
+# define UTF8_TTY_ISO2022JP 0
+#endif
+
 HTCJKlang HTCJK = NOCJK;	/* CJK enum value.              */
 BOOL HTPassEightBitRaw = FALSE;	/* Pass 161-172,174-255 raw.    */
 BOOL HTPassEightBitNum = FALSE;	/* Pass ^ numeric entities raw. */
@@ -786,7 +792,8 @@ static void handle_entity(HTStream *me, int term)
     if (psrc_view)
 	PSRCSTART(badseq);
 #endif
-    CTRACE((tfp, "SGML: Unknown entity '%s' %" PRI_UCode_t " %ld\n", s, code, uck));	/* S/390 -- gil -- 0695 */
+    /* S/390 -- gil -- 0695 */
+    CTRACE((tfp, "SGML: Unknown entity '%s' %" PRI_UCode_t " %ld\n", s, code, uck));
     PUTC('&');
     PUTS(s);
     if (term != '\0')
@@ -1659,6 +1666,38 @@ static void SGML_character(HTStream *me, int c_in)
     /*
      * If we want the raw input converted to Unicode, try that now.  - FM
      */
+    /* Convert ISO-2022-JP to Unicode (charset=iso-2022-jp is unrecognized) */
+#define IS_JIS7_HILO(c) (0x20<(c)&&(c)<0x7F)
+    if (UTF8_TTY_ISO2022JP && (me->state == S_nonascii_text
+			       || me->state == S_nonascii_text_sq
+			       || me->state == S_nonascii_text_dq)) {
+	/* end of ISO-2022-JP? || not in ISO-2022-JP range */
+	if (TOASCII(c) == '\033' || !IS_JIS7_HILO(c)) {
+	    me->kanji_buf = '\0';
+	    goto top1;
+	}
+	if (me->kanji_buf == '\t') {	/* flag for single byte kana in "ESC(I" */
+	    if (conv_jisx0201kana) {
+		JISx0201TO0208_SJIS(c | 0200,
+				    (unsigned char *) me->U.utf_buf,
+				    (unsigned char *) me->U.utf_buf + 1);
+		clong = UCTransJPToUni(me->U.utf_buf, 2,
+				       UCGetLYhndl_byMIME("shift_jis"));
+	    } else {
+		clong = UCTransToUni(c | 0200, UCGetLYhndl_byMIME("shift_jis"));
+	    }
+	} else if (me->kanji_buf) {
+	    me->U.utf_buf[0] = (char) (me->kanji_buf | 0200);	/* to EUC-JP */
+	    me->U.utf_buf[1] = (char) (c | 0200);
+	    clong = UCTransJPToUni(me->U.utf_buf, 2,
+				   UCGetLYhndl_byMIME("euc-jp"));
+	    me->kanji_buf = '\0';
+	} else {
+	    me->kanji_buf = c;
+	    clong = ucNeedMore;
+	}
+	goto top1;
+    }
     if (me->T.trans_to_uni &&
 #ifdef EXP_JAPANESEUTF8_SUPPORT
 	((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-jp") == 0) ||
@@ -1670,6 +1709,15 @@ static void SGML_character(HTStream *me, int c_in)
 		    me->U.utf_buf[0] = (char) c;
 		    me->U.utf_count = 1;
 		    clong = -11;
+		} else if (IS_SJIS_X0201KANA(c)) {
+		    if (conv_jisx0201kana) {
+			JISx0201TO0208_SJIS(c,
+					    (unsigned char *) me->U.utf_buf,
+					    (unsigned char *) me->U.utf_buf + 1);
+			clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
+		    } else {
+			clong = UCTransToUni(c, me->inUCLYhndl);
+		    }
 		}
 	    } else {
 		if (IS_SJIS_LO((unsigned char) c)) {
@@ -1680,7 +1728,7 @@ static void SGML_character(HTStream *me, int c_in)
 	    }
 	} else {
 	    if (me->U.utf_count == 0) {
-		if (IS_EUC_HI((unsigned char) c)) {
+		if (IS_EUC_HI((unsigned char) c) || c == 0x8E) {
 		    me->U.utf_buf[0] = (char) c;
 		    me->U.utf_count = 1;
 		    clong = -11;
@@ -1696,7 +1744,8 @@ static void SGML_character(HTStream *me, int c_in)
 	goto top1;
     } else if (me->T.trans_to_uni &&
 #endif
-	       ((TOASCII(clong) >= LYlowest_eightbit[me->inUCLYhndl]) ||	/* S/390 -- gil -- 0744 */
+	/* S/390 -- gil -- 0744 */
+	       ((TOASCII(clong) >= LYlowest_eightbit[me->inUCLYhndl]) ||
 		(clong < ' ' && clong != 0 &&
 		 me->T.trans_C0_to_uni))) {
 	/*
@@ -1801,7 +1850,8 @@ static void SGML_character(HTStream *me, int c_in)
      */
     if (TOASCII(clong) < 32 &&
 	c != '\t' && c != '\n' && c != '\r' &&
-	!IS_CJK_TTY)
+	!IS_CJK_TTY &&
+	!(UTF8_TTY_ISO2022JP && TOASCII(c) == '\033'))
 	goto after_switch;
 
     /*
@@ -1909,13 +1959,15 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_in_kanji;
 	    me->kanji_buf = c;
 	    break;
-	} else if (IS_CJK_TTY && TOASCII(c) == '\033') {	/* S/390 -- gil -- 0881 */
+	} else if ((IS_CJK_TTY || UTF8_TTY_ISO2022JP) && TOASCII(c) == '\033') {
+	    /* S/390 -- gil -- 0881 */
 	    /*
 	     * Setting up for CJK escape sequence handling (based on Takuya
 	     * ASADA's (asada@three-a.co.jp) CJK Lynx).  - FM
 	     */
 	    me->state = S_esc;
-	    PUTC(c);
+	    if (!UTF8_TTY_ISO2022JP)
+		PUTC(c);
 	    break;
 	}
 
@@ -3642,7 +3694,8 @@ static void SGML_character(HTStream *me, int c_in)
 	     * - Takuya ASADA (asada@three-a.co.jp)
 	     */
 	    me->state = S_esc_sq;
-	    HTChunkPutc(string, c);
+	    if (!UTF8_TTY_ISO2022JP)
+		HTChunkPutc(string, c);
 	} else if (me->T.decode_utf8 &&
 		   *me->U.utf_buf) {
 	    HTChunkPuts(string, me->U.utf_buf);
@@ -3686,7 +3739,8 @@ static void SGML_character(HTStream *me, int c_in)
 	     * - Takuya ASADA (asada@three-a.co.jp)
 	     */
 	    me->state = S_esc_dq;
-	    HTChunkPutc(string, c);
+	    if (!UTF8_TTY_ISO2022JP)
+		HTChunkPutc(string, c);
 	} else if (me->T.decode_utf8 &&
 		   *me->U.utf_buf) {
 	    HTChunkPuts(string, me->U.utf_buf);
@@ -3949,8 +4003,11 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_paren;
 	} else {
 	    me->state = S_text;
+	    if (UTF8_TTY_ISO2022JP)
+		goto top1;
 	}
-	PUTC(c);
+	if (!UTF8_TTY_ISO2022JP)
+	    PUTC(c);
 	break;
 
     case S_dollar:		/* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
@@ -3959,7 +4016,8 @@ static void SGML_character(HTStream *me, int c_in)
 	} else if (c == '(') {
 	    me->state = S_dollar_paren;
 	}
-	PUTC(c);
+	if (!UTF8_TTY_ISO2022JP)
+	    PUTC(c);
 	break;
 
     case S_dollar_paren:	/* Expecting 'C' after CJK "ESC$(". */
@@ -3967,8 +4025,13 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_nonascii_text;
 	} else {
 	    me->state = S_text;
+	    if (UTF8_TTY_ISO2022JP) {
+		PUTS("$(");
+		goto top1;
+	    }
 	}
-	PUTC(c);
+	if (!UTF8_TTY_ISO2022JP)
+	    PUTC(c);
 	break;
 
     case S_paren:		/* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
@@ -3976,19 +4039,30 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_text;
 	} else if (c == 'I') {
 	    me->state = S_nonascii_text;
+	    if (UTF8_TTY_ISO2022JP)
+		me->kanji_buf = '\t';	/* flag for single byte katakana */
 	} else {
 	    me->state = S_text;
+	    if (UTF8_TTY_ISO2022JP) {
+		PUTC('(');
+		goto top1;
+	    }
 	}
-	PUTC(c);
+	if (!UTF8_TTY_ISO2022JP)
+	    PUTC(c);
 	break;
 
     case S_nonascii_text:	/* Expecting CJK ESC after non-ASCII text. */
 	if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1264 */
 	    me->state = S_esc;
-	}
-	PUTC(c);
-	if (c < 32)
+	} else if (c < 32) {
 	    me->state = S_text;
+	}
+	if (UTF8_TTY_ISO2022JP) {
+	    if (TOASCII(c) != '\033')
+		PUTUTF8(clong);
+	} else
+	    PUTC(c);
 	break;
 
     case S_esc_sq:		/* Expecting '$'or '(' following CJK ESC. */
@@ -3998,8 +4072,11 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_paren_sq;
 	} else {
 	    me->state = S_squoted;
+	    if (UTF8_TTY_ISO2022JP)
+		goto top1;
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_dollar_sq:		/* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
@@ -4008,7 +4085,8 @@ static void SGML_character(HTStream *me, int c_in)
 	} else if (c == '(') {
 	    me->state = S_dollar_paren_sq;
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_dollar_paren_sq:	/* Expecting 'C' after CJK "ESC$(". */
@@ -4016,8 +4094,13 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_nonascii_text_sq;
 	} else {
 	    me->state = S_squoted;
+	    if (UTF8_TTY_ISO2022JP) {
+		HTChunkPuts(string, "$(");
+		goto top1;
+	    }
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_paren_sq:		/* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
@@ -4025,17 +4108,28 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_squoted;
 	} else if (c == 'I') {
 	    me->state = S_nonascii_text_sq;
+	    if (UTF8_TTY_ISO2022JP)
+		me->kanji_buf = '\t';	/* flag for single byte katakana */
 	} else {
 	    me->state = S_squoted;
+	    if (UTF8_TTY_ISO2022JP) {
+		HTChunkPutc(string, '(');
+		goto top1;
+	    }
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_nonascii_text_sq:	/* Expecting CJK ESC after non-ASCII text. */
 	if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1281 */
 	    me->state = S_esc_sq;
 	}
-	HTChunkPutc(string, c);
+	if (UTF8_TTY_ISO2022JP) {
+	    if (TOASCII(c) != '\033')
+		HTChunkPutUtf8Char(string, clong);
+	} else
+	    HTChunkPutc(string, c);
 	break;
 
     case S_esc_dq:		/* Expecting '$'or '(' following CJK ESC. */
@@ -4045,8 +4139,11 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_paren_dq;
 	} else {
 	    me->state = S_dquoted;
+	    if (UTF8_TTY_ISO2022JP)
+		goto top1;
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_dollar_dq:		/* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
@@ -4055,7 +4152,8 @@ static void SGML_character(HTStream *me, int c_in)
 	} else if (c == '(') {
 	    me->state = S_dollar_paren_dq;
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_dollar_paren_dq:	/* Expecting 'C' after CJK "ESC$(". */
@@ -4063,8 +4161,13 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_nonascii_text_dq;
 	} else {
 	    me->state = S_dquoted;
+	    if (UTF8_TTY_ISO2022JP) {
+		HTChunkPuts(string, "$(");
+		goto top1;
+	    }
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_paren_dq:		/* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
@@ -4072,17 +4175,28 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_dquoted;
 	} else if (c == 'I') {
 	    me->state = S_nonascii_text_dq;
+	    if (UTF8_TTY_ISO2022JP)
+		me->kanji_buf = '\t';	/* flag for single byte katakana */
 	} else {
 	    me->state = S_dquoted;
+	    if (UTF8_TTY_ISO2022JP) {
+		HTChunkPutc(string, '(');
+		goto top1;
+	    }
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_nonascii_text_dq:	/* Expecting CJK ESC after non-ASCII text. */
 	if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1298 */
 	    me->state = S_esc_dq;
 	}
-	HTChunkPutc(string, c);
+	if (UTF8_TTY_ISO2022JP) {
+	    if (TOASCII(c) != '\033')
+		HTChunkPutUtf8Char(string, clong);
+	} else
+	    HTChunkPutc(string, c);
 	break;
 
     case S_junk_tag:
diff --git a/src/UCdomap.c b/src/UCdomap.c
index 7c2ef7c5..ddac1876 100644
--- a/src/UCdomap.c
+++ b/src/UCdomap.c
@@ -1,5 +1,5 @@
 /*
- * $LynxId: UCdomap.c,v 1.103 2017/02/08 01:23:33 tom Exp $
+ * $LynxId: UCdomap.c,v 1.104 2018/12/29 00:20:33 Kihara.Hideto Exp $
  *
  *  UCdomap.c
  *  =========
@@ -1217,6 +1217,16 @@ UCode_t UCTransToUni(int ch_in,
 		    buffer[0] = (char) ch_in;
 		    inx = 1;
 		    return ucNeedMore;
+		} else if (IS_SJIS_X0201KANA(ch_iu)) {
+		    buffer[0] = (char) ch_in;
+		    buffer[1] = 0;
+		    cd = iconv_open("UTF-16BE", "Shift_JIS");
+		    ilen = 1;
+		    (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen);
+		    iconv_close(cd);
+		    if ((ilen == 0) && (olen == 0)) {
+			return (UCH(obuffer[0]) << 8) + UCH(obuffer[1]);
+		    }
 		}
 	    } else {
 		if (IS_SJIS_LO(ch_iu)) {
@@ -1235,7 +1245,7 @@ UCode_t UCTransToUni(int ch_in,
 	}
 	if (strcmp(LYCharSet_UC[charset_in].MIMEname, "euc-jp") == 0) {
 	    if (inx == 0) {
-		if (IS_EUC_HI(ch_iu)) {
+		if (IS_EUC_HI(ch_iu) || ch_iu == 0x8E) {
 		    buffer[0] = (char) ch_in;
 		    inx = 1;
 		    return ucNeedMore;