1 files changed, 213 insertions, 0 deletions
diff --git a/src/UCAux.c b/src/UCAux.c
new file mode 100644
index 00000000..cca77ebc
--- /dev/null
+++ b/src/UCAux.c
@@ -0,0 +1,213 @@
+#ifdef EXP_CHARTRANS
+#include "HTUtils.h"
+#include "tcp.h"
+
+#include "HTCJK.h"
+#include "UCDefs.h"
+#include "HTStream.h"
+#include "UCAux.h"
+
+extern HTCJKlang HTCJK;
+extern LYUCcharset LYCharSet_UC[];
+
+/* Tell whether charset index `from` can be translated to Unicode.
+** A negative index yields NO; otherwise YES as soon as one of the enc /
+** codepoints tests below holds, else the result of UChndl >= 0. */
+PUBLIC BOOL UCCanUniTranslateFrom ARGS1(int, from)
+{
+    if (from >= 0 && (LYCharSet_UC[from].enc == UCT_ENC_7BIT ||
+		      LYCharSet_UC[from].enc == UCT_ENC_UTF8 ||
+		      (LYCharSet_UC[from].codepoints & (UCT_CP_SUBSETOF_LAT1))))
+	return YES;
+    return (from >= 0 && LYCharSet_UC[from].UChndl >= 0);
+}
+/* Tell whether Unicode can be translated to charset index `to`: every
+** non-negative index is accepted (well at least some characters...). */
+PUBLIC BOOL UCCanTranslateUniTo ARGS1(int, to)
+{
+    return (to >= 0) ? YES : NO;
+}
+/* Tell whether text in charset `from` can be translated to charset `to`
+** (both index LYCharSet_UC[]).  Equal indices yield YES before any other
+** test; index 0 on either side is delegated to the Unicode helpers. */
+PUBLIC BOOL UCCanTranslateFromTo ARGS2(int, from, int, to)
+{
+    if (from==to)
+	return YES;
+    if (from < 0 || to < 0)
+	return NO;
+    if (from==0)
+	return UCCanTranslateUniTo(to);
+    if (to==0)
+	return UCCanUniTranslateFrom(from);
+    if (LYCharSet_UC[to].enc == UCT_ENC_UTF8) {
+	return (LYCharSet_UC[from].UChndl >= 0);
+    }
+    {
+	char * fromname = LYCharSet_UC[from].MIMEname;
+	char * toname = LYCharSet_UC[to].MIMEname;
+	if (0==strcmp(fromname,"x-transparent") ||
+	    0==strcmp(toname,"x-transparent")) {
+	    return YES;
+	}
+	if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
+	    if (HTCJK == NOCJK)	/* use that global flag, for now */
+		return NO;
+	    if (HTCJK == JAPANESE && (
+		0==strcmp(fromname,"euc-jp") ||
+		0==strncmp(fromname,"iso-2022-jp",11) ||
+		0==strcmp(fromname,"shift_jis")
+		))
+		return YES;
+	    return NO;	/* if not handled by (from==to) above */
+	}
+	if (0==strcmp(fromname,"koi8-r")) {
+	    return YES;	/* will try to use stripping of high bit... */
+	}
+	if (0==strcmp(fromname,"iso-8859-5") || /* cyrillic; koi8-r done above */
+	    0==strcmp(fromname,"koi-8")) {
+	    if (0!=strcmp(toname,"iso-8859-5") &&
+		0!=strcmp(toname,"koi8-r") &&
+		0!=strcmp(toname,"iso-8859-2"))
+		return NO;
+	}
+    }
+    return (LYCharSet_UC[from].UChndl >= 0);
+}
+
+/* The idea here is that any stage of the stream pipe which is interested
+** in some charset dependent processing will call this function.
+** Given input and output charsets, this function will set various flags
+** in a UCTransParams structure that _suggest_ to the caller what to do.
+** All ten flags are assigned on every path, so no value from an earlier
+** call survives a change of charsets.
+**
+** Should be called once when a stage starts processing text (and the
+** input and output charsets are known), or whenever one of input or
+** output charsets has changed (e.g. by SGML.c stage after HTML.c stage
+** has processed a META tag).
+** The global flags (LYRawMode, HTPassEightBitRaw etc.) are currently
+** not taken into account here, it's still up to the caller to do something
+** about them.
+*/
+PUBLIC void UCSetTransParams ARGS5(
+    UCTransParams *, 	pT,
+    int,		cs_in,
+    CONST LYUCcharset*,	p_in,
+    int,		cs_out,
+    CONST LYUCcharset*,	p_out
+    )
+{
+    pT->transp = (0==strcmp(p_in->MIMEname,"x-transparent") ||
+		0==strcmp(p_out->MIMEname,"x-transparent"));
+    if (pT->transp) {
+	pT->do_cjk = FALSE;
+	pT->decode_utf8 = FALSE;
+	pT->output_utf8 = FALSE;	/* we may, but won't know about it */
+	pT->do_8bitraw = TRUE;
+	pT->use_raw_char_in = TRUE;
+	pT->strip_raw_char_in = FALSE;
+	pT->pass_160_173_raw = TRUE;
+	pT->trans_to_uni = FALSE;	/* as whenever do_8bitraw is TRUE */
+	pT->trans_from_uni = FALSE;	/* ditto */
+    } else {
+	pT->do_cjk = ((p_in->enc == UCT_ENC_CJK) && (HTCJK != NOCJK));
+	pT->decode_utf8 = (p_in->enc == UCT_ENC_UTF8);
+	pT->output_utf8 = (p_out->enc == UCT_ENC_UTF8);
+	if (pT->do_cjk) {
+	    pT->trans_to_uni = FALSE;
+	    pT->do_8bitraw = FALSE;
+	    pT->pass_160_173_raw = TRUE;
+	    pT->use_raw_char_in = FALSE; /* not used for CJK */
+	    pT->strip_raw_char_in = FALSE; /* not used for CJK */
+	    pT->trans_from_uni = FALSE; /* not used for CJK */
+	} else {
+	    BOOL use_ucs;
+	    BOOL intm_ucs = (
+		cs_in == 0 || pT->decode_utf8 ||
+	    (p_in->codepoints & (UCT_CP_SUBSETOF_LAT1|UCT_CP_SUBSETOF_UCS2))
+		);
+	    pT->trans_to_uni = (!intm_ucs && UCCanUniTranslateFrom(cs_in));
+	    pT->strip_raw_char_in =
+		((!intm_ucs ||
+		  (p_out->enc==UCT_ENC_7BIT) ||
+		  (p_out->repertoire & UCT_REP_SUBSETOF_LAT1)) &&
+		cs_in != cs_out &&
+		0==strcmp(p_in->MIMEname,"koi8-r"));
+	    use_ucs = (intm_ucs || pT->trans_to_uni);
+	    pT->do_8bitraw = (!use_ucs);
+	    pT->pass_160_173_raw = (!use_ucs &&
+				    !(p_in->like8859 & UCT_R_8859SPECL));
+	    pT->use_raw_char_in = (!pT->output_utf8 && cs_in == cs_out);
+	    pT->trans_from_uni = (use_ucs && !pT->do_8bitraw &&
+				  !pT->use_raw_char_in &&
+				  UCCanTranslateUniTo(cs_out));
+	}
+    }
+}
+
+/* Put *pT into a neutral state: each of the ten flags assigned below
+** is switched off (FALSE). */
+PUBLIC void UCTransParams_clear ARGS1(
+    UCTransParams *,    pT)
+{
+    /* mode selection */
+    pT->transp = pT->do_cjk = FALSE;
+    /* UTF8 on the input and on the output side */
+    pT->decode_utf8 = pT->output_utf8 = FALSE;
+    /* treatment of raw input characters */
+    pT->do_8bitraw = pT->use_raw_char_in = FALSE;
+    pT->strip_raw_char_in = pT->pass_160_173_raw = FALSE;
+    pT->trans_to_uni = pT->trans_from_uni = FALSE;	/* unicode steps */
+}
+
+/* UCPutUtf8_charstring() writes the UTF8 encoding of a unicode value to an
+** output target, one byte at a time, through the put_character method
+** that the caller passes in.  The target is declared as HTStream* (a
+** HTStructured* can be passed via typecast).
+**
+** Returns YES once the bytes are written.  Returns NO without writing
+** anything when code is below 128 (conversion would be unnecessary) or
+** above 0x7fffffff (considered invalid).  Sequences are 2 to 6 bytes long.
+**
+** [Could be used more generally, but is currently only used for &#nnnnn
+** stuff - generation of UTF8 from 8-bit encoded charsets not yet done
+** by SGML.c etc.]
+*/
+#define PUTC(ch) ((*myPutc)(target, (char)(ch)))
+#define PUTC2(ch) ((*myPutc)(target,(char)(0x80|(0x3f &(ch)))))
+
+PUBLIC BOOL UCPutUtf8_charstring ARGS3(
+    HTStream *,	target,
+    putc_func_t *,	myPutc,
+    long,	code)
+{
+    int lead;			/* marker bits of the first byte */
+    int shift;			/* bit offset of the group in that byte */
+
+    if (code < 128)
+	return NO;		/* indicate to caller we didn't handle it */
+    if (code < 0x800L) {
+	lead = 0xc0;
+	shift = 6;
+    } else if (code < 0x10000L) {
+	lead = 0xe0;
+	shift = 12;
+    } else if (code < 0x200000L) {
+	lead = 0xf0;
+	shift = 18;
+    } else if (code < 0x4000000L) {
+	lead = 0xf8;
+	shift = 24;
+    } else if (code <= 0x7fffffffL) {
+	lead = 0xfc;
+	shift = 30;
+    } else
+	return NO;
+    PUTC(lead | (code >> shift));
+    while (shift > 0) {		/* remaining 6-bit groups, high to low */
+	shift -= 6;
+	PUTC2(code >> shift);
+    }
+    return YES;
+}
+#endif /* EXP_CHARTRANS */