#ifdef EXP_CHARTRANS

#include "HTUtils.h"
#include "tcp.h"
#include "HTCJK.h"
#include "UCDefs.h"
#include "HTStream.h"
#include "UCAux.h"

extern HTCJKlang HTCJK;
extern LYUCcharset LYCharSet_UC[];

/*
**  Tell whether text in charset `from' can be translated to Unicode.
**
**  from:  index into LYCharSet_UC[].
**
**  Returns NO for a negative index.  Otherwise returns YES when the
**  charset's encoding is 7-bit or UTF-8, when its codepoints are flagged
**  as a subset of Latin-1, or when it has a non-negative UChndl.
**  No upper bound check is made on `from'.
*/
PUBLIC BOOL UCCanUniTranslateFrom ARGS1(int, from)
{
    if (from < 0)
        return NO;
    if (LYCharSet_UC[from].enc == UCT_ENC_7BIT ||
        LYCharSet_UC[from].enc == UCT_ENC_UTF8)
        return YES;
    if (LYCharSet_UC[from].codepoints & (UCT_CP_SUBSETOF_LAT1))
        return YES;
    return (LYCharSet_UC[from].UChndl >= 0);
}

/*
**  Tell whether Unicode can be translated to charset `to'.
**
**  to:  charset index.
**
**  Returns NO for a negative index and YES for every other value; the
**  charset table is not consulted.
*/
PUBLIC BOOL UCCanTranslateUniTo ARGS1(int, to)
{
    if (to < 0)
        return NO;
    return YES;     /* well at least some characters... */
}

/*
**  Tell whether text can be translated from charset `from' to charset `to'.
**
**  from, to:  indexes into LYCharSet_UC[].
**
**  The checks are applied in this order:
**    - identical indexes: YES (this test comes before the sign test, so
**      two equal negative indexes also yield YES);
**    - either index negative: NO;
**    - from == 0: result of UCCanTranslateUniTo(to);
**    - to == 0: result of UCCanUniTranslateFrom(from);
**      (NOTE(review): index 0 is presumably the Latin-1 entry - confirm
**      against the charset table);
**    - target encoding is UTF-8: YES iff the source has a UChndl >= 0;
**    - either MIME name is "x-transparent": YES;
**    - source encoding is CJK: YES only when HTCJK is JAPANESE and the
**      source is one of the Japanese charsets listed below, else NO;
**    - source is "koi8-r": YES;
**    - source is another cyrillic charset and the target is not one of
**      "iso-8859-5", "koi8-r", "iso-8859-2": NO;
**    - otherwise: YES iff the source has a UChndl >= 0.
*/
PUBLIC BOOL UCCanTranslateFromTo ARGS2(int, from, int, to)
{
    if (from==to)
        return YES;
    if (from < 0 || to < 0)
        return NO;
    if (from==0)
        return UCCanTranslateUniTo(to);
    if (to==0)
        return UCCanUniTranslateFrom(from);
    if (LYCharSet_UC[to].enc == UCT_ENC_UTF8) {
        return (LYCharSet_UC[from].UChndl >= 0);
    }
    {
        char * fromname = LYCharSet_UC[from].MIMEname;
        char * toname = LYCharSet_UC[to].MIMEname;

        if (0==strcmp(fromname,"x-transparent") ||
            0==strcmp(toname,"x-transparent")) {
            return YES;
        }
        if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
            if (HTCJK == NOCJK)     /* use that global flag, for now */
                return NO;
            if (HTCJK == JAPANESE &&
                (0==strcmp(fromname,"euc-jp") ||
                 0==strncmp(fromname,"iso-2022-jp",11) ||
                 0==strcmp(fromname,"shift_jis")))
                return YES;
            return NO;      /* if not handled by (from==to) above */
        }
        if (0==strcmp(fromname,"koi8-r")) {
            /* will try to use stripping of high bit... */
            return YES;
        }

        /*
        **  The "koi8-r" comparison below cannot succeed while the early
        **  return above is in place; it is kept so that this list stays
        **  complete if that return is ever removed.
        */
        if (0==strcmp(fromname,"koi8-r") ||     /* from cyrillic */
            0==strcmp(fromname,"iso-8859-5") ||
            0==strcmp(fromname,"koi-8")) {
            if (0!=strcmp(toname,"iso-8859-5") &&
                0!=strcmp(toname,"koi8-r") &&
                0!=strcmp(toname,"iso-8859-2"))
                return NO;
        }
    }
    return (LYCharSet_UC[from].UChndl >= 0);
}

/*
**  The idea here is that any stage of the stream pipe which is interested
**  in some charset dependent processing will call this function.
**  Given input and output charsets, this function will set various flags
**  in a UCTransParams structure that _suggest_ to the caller what to do.
**
**  Should be called once when a stage starts processing text (and the
**  input and output charsets are known), or whenever one of input or
**  output charsets has changed (e.g. by SGML.c stage after HTML.c stage
**  has processed a META tag).
**  The global flags (LYRawMode, HTPassEightBitRaw etc.) are currently
**  not taken into account here, it's still up to the caller to do something
**  about them.
**
**  pT:      structure to fill in; must not be NULL.
**  cs_in:   index of the input charset.
**  p_in:    descriptor of the input charset; dereferenced unconditionally.
**  cs_out:  index of the output charset.
**  p_out:   descriptor of the output charset; dereferenced unconditionally.
**
**  Every flag that UCTransParams_clear() resets is assigned on every path,
**  so a structure that is reused after a charset change (or was never
**  cleared) does not keep values from an earlier call.
*/
PUBLIC void UCSetTransParams ARGS5(
        UCTransParams *,        pT,
        int,                    cs_in,
        CONST LYUCcharset*,     p_in,
        int,                    cs_out,
        CONST LYUCcharset*,     p_out)
{
    pT->transp = (0==strcmp(p_in->MIMEname,"x-transparent") ||
                  0==strcmp(p_out->MIMEname,"x-transparent"));

    if (pT->transp) {
        /* Transparent: pass everything through untouched. */
        pT->do_cjk = FALSE;
        pT->decode_utf8 = FALSE;
        pT->output_utf8 = FALSE;    /* we may, but won't know about it */
        pT->do_8bitraw = TRUE;
        pT->use_raw_char_in = TRUE;
        pT->strip_raw_char_in = FALSE;
        pT->pass_160_173_raw = TRUE;
        /* No translation through Unicode; do not leave these unset. */
        pT->trans_to_uni = FALSE;
        pT->trans_from_uni = FALSE;
        return;
    }

    pT->do_cjk = ((p_in->enc == UCT_ENC_CJK) && (HTCJK != NOCJK));
    pT->decode_utf8 = (p_in->enc == UCT_ENC_UTF8);
    pT->output_utf8 = (p_out->enc == UCT_ENC_UTF8);

    if (pT->do_cjk) {
        pT->trans_to_uni = FALSE;
        pT->do_8bitraw = FALSE;
        pT->pass_160_173_raw = TRUE;
        pT->use_raw_char_in = FALSE;    /* not used for CJK */
        pT->trans_from_uni = FALSE;     /* not used for CJK */
        pT->strip_raw_char_in = FALSE;  /* do not leave this unset */
    } else {
        BOOL intm_ucs;  /* input can be used as Unicode values directly */
        BOOL use_ucs;   /* processing goes through Unicode at all */

        intm_ucs = (cs_in == 0 || pT->decode_utf8 ||
                    (p_in->codepoints &
                     (UCT_CP_SUBSETOF_LAT1|UCT_CP_SUBSETOF_UCS2)));
        pT->trans_to_uni = (!intm_ucs &&
                            UCCanUniTranslateFrom(cs_in));
        /* High-bit stripping is suggested only for "koi8-r" input that
        ** is being converted to a different charset. */
        pT->strip_raw_char_in =
            ((!intm_ucs ||
              (p_out->enc==UCT_ENC_7BIT) ||
              (p_out->repertoire & UCT_REP_SUBSETOF_LAT1)) &&
             cs_in != cs_out &&
             0==strcmp(p_in->MIMEname,"koi8-r"));
        use_ucs = (intm_ucs || pT->trans_to_uni);
        pT->do_8bitraw = (!use_ucs);
        pT->pass_160_173_raw = (!use_ucs &&
                                !(p_in->like8859 & UCT_R_8859SPECL));
        pT->use_raw_char_in = (!pT->output_utf8 && cs_in == cs_out);
        pT->trans_from_uni = (use_ucs && !pT->do_8bitraw &&
                              !pT->use_raw_char_in &&
                              UCCanTranslateUniTo(cs_out));
    }
}

/*
**  Reset every flag of *pT to FALSE.
**
**  pT:  structure to reset; must not be NULL.
*/
PUBLIC void UCTransParams_clear ARGS1(
        UCTransParams *,        pT)
{
    pT->transp = FALSE;
    pT->do_cjk = FALSE;
    pT->decode_utf8 = FALSE;
    pT->output_utf8 = FALSE;
    pT->do_8bitraw = FALSE;
    pT->use_raw_char_in = FALSE;
    pT->strip_raw_char_in = FALSE;
    pT->pass_160_173_raw = FALSE;
    pT->trans_to_uni = FALSE;
    pT->trans_from_uni = FALSE;
}

/*
**  Given an output target HTStream* (can also be a HTStructured* via
**  typecast), the target stream's put_character method, and a unicode
**  character, UCPutUtf8_charstring() will either output the UTF8 encoding
**  of the unicode and return YES, or do nothing and return NO (if
**  conversion would be unnecessary or the unicode character is considered
**  invalid).
**
**  target:  stream handed to myPutc as its first argument.
**  myPutc:  function called once per output byte.
**  code:    character value to encode.
**
**  Values below 128 (including negative ones) are not written.  Larger
**  values are written as 2 to 6 bytes: a lead byte carrying the top bits
**  followed by continuation bytes of the form 10xxxxxx, six bits each.
**
**  [Could be used more generally, but is currently only used for &#nnnnn
**  stuff - generation of UTF8 from 8-bit encoded charsets not yet done
**  by SGML.c etc.]
*/
#define PUTC(ch) ((*myPutc)(target, (char)(ch)))
#define PUTC2(ch) ((*myPutc)(target,(char)(0x80|(0x3f &(ch)))))

PUBLIC BOOL UCPutUtf8_charstring ARGS3(
        HTStream *,     target,
        putc_func_t *,  myPutc,
        long,           code)
{
    if (code < 128)
        return NO;      /* indicate to caller we didn't handle it */
    else if (code < 0x800L) {
        PUTC(0xc0 | (code>>6));
        PUTC2(code);
    } else if (code < 0x10000L) {
        PUTC(0xe0 | (code>>12));
        PUTC2(code>>6);
        PUTC2(code);
    } else if (code < 0x200000L) {
        PUTC(0xf0 | (code>>18));
        PUTC2(code>>12);
        PUTC2(code>>6);
        PUTC2(code);
    } else if (code < 0x4000000L) {
        PUTC(0xf8 | (code>>24));
        PUTC2(code>>18);
        PUTC2(code>>12);
        PUTC2(code>>6);
        PUTC2(code);
    } else if (code<=0x7fffffffL) {
        PUTC(0xfc | (code>>30));
        PUTC2(code>>24);
        PUTC2(code>>18);
        PUTC2(code>>12);
        PUTC2(code>>6);
        PUTC2(code);
    } else
        return NO;      /* only reachable where long exceeds 31 value bits */
    return YES;
}

#endif /* EXP_CHARTRANS */