#include "HTUtils.h"
#include "tcp.h"

#include "HTCJK.h"
#include "UCDefs.h"
#include "HTStream.h"
#include "UCAux.h"

extern HTCJKlang HTCJK;
extern LYUCcharset LYCharSet_UC[];

/*
 *  Rate how well text in charset `from' (an index into LYCharSet_UC)
 *  can be translated to Unicode.
 *
 *  Returns TQ_NO for a negative index; TQ_EXCELLENT for 7-bit or UTF-8
 *  encodings and for charsets flagged UCT_CP_SUBSETOF_LAT1; otherwise
 *  TQ_GOOD if the charset has a translation handle (UChndl >= 0) and
 *  TQ_NO if it has none.
 */
PUBLIC UCTQ_t UCCanUniTranslateFrom ARGS1(
    int,        from)
{
    if (from < 0)
        return TQ_NO;
    if (LYCharSet_UC[from].enc == UCT_ENC_7BIT ||
        LYCharSet_UC[from].enc == UCT_ENC_UTF8)
        return TQ_EXCELLENT;
    if (LYCharSet_UC[from].codepoints & (UCT_CP_SUBSETOF_LAT1))
        return TQ_EXCELLENT;
    return ((LYCharSet_UC[from].UChndl >= 0) ? TQ_GOOD : TQ_NO);
}

/*
 *  Rate how well Unicode can be translated to charset `to' (an index
 *  into LYCharSet_UC).
 *
 *  Returns TQ_NO for a negative index, TQ_POOR for 7-bit and CJK
 *  encodings, TQ_EXCELLENT for UTF-8, and TQ_GOOD for everything else.
 */
PUBLIC UCTQ_t UCCanTranslateUniTo ARGS1(
    int,        to)
{
    if (to < 0)
        return TQ_NO;
    if (LYCharSet_UC[to].enc == UCT_ENC_7BIT)
        return TQ_POOR;
    if (LYCharSet_UC[to].enc == UCT_ENC_UTF8)
        return TQ_EXCELLENT;
    if (LYCharSet_UC[to].enc == UCT_ENC_CJK)
        return TQ_POOR;
    /*
     *  The result is TQ_GOOD whether or not the charset has a
     *  translation handle (UChndl >= 0): at least some characters
     *  can be translated, we don't know more.
     */
    return TQ_GOOD;
}

/*
 *  Rate how well text in charset `from' can be translated to charset
 *  `to' (both are indices into LYCharSet_UC).
 *
 *  Identical indices always rate TQ_EXCELLENT (this is checked before
 *  the negative-index test).  Charset index 0 is special-cased on either
 *  side and delegates to the Unicode rating functions above.
 *  NOTE(review): index 0 is presumably ISO-8859-1 - confirm against the
 *  charset table.
 */
PUBLIC UCTQ_t UCCanTranslateFromTo ARGS2(
    int,        from,
    int,        to)
{
    if (from == to)
        return TQ_EXCELLENT;
    if (from < 0 || to < 0)
        return TQ_NO;
    if (from == 0)
        return UCCanTranslateUniTo(to);
    if (to == 0 || LYCharSet_UC[to].enc == UCT_ENC_UTF8)
        return UCCanUniTranslateFrom(from);
    {
        CONST char * fromname = LYCharSet_UC[from].MIMEname;
        CONST char * toname = LYCharSet_UC[to].MIMEname;
        /* Result when `from' has no / has a translation handle. */
        UCTQ_t tqmin = TQ_NO, tqmax = TQ_GOOD;

        if (!strcmp(fromname, "x-transparent") ||
            !strcmp(toname, "x-transparent")) {
            return TQ_GOOD;
        } else if (!strcmp(fromname, "us-ascii")) {
            return TQ_GOOD;
        }
        if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
            if (HTCJK == NOCJK)     /* use that global flag, for now */
                return TQ_NO;
            if (HTCJK == JAPANESE &&
                (!strcmp(fromname, "euc-jp") ||
                 !strncmp(fromname, "iso-2022-jp",11) ||
                 !strcmp(fromname, "shift_jis")))
                return TQ_GOOD;
            return TQ_NO;   /* if not handled by (from == to) above */
        }
        if (!strcmp(fromname, "koi8-r")) {
            /*
             *  Will try to use stripping of high bit...
             */
            tqmin = TQ_POOR;
        }

        if (!strcmp(fromname, "koi8-r") ||      /* from cyrillic */
            !strcmp(fromname, "iso-8859-5") ||
            !strcmp(fromname, "cp866") ||
            !strcmp(fromname, "windows-1251") ||
            !strcmp(fromname, "koi-8")) {
            /* Cyrillic source, non-cyrillic target: cap the rating. */
            if (strcmp(toname, "iso-8859-5") &&
                strcmp(toname, "koi8-r") &&
                strcmp(toname, "cp866") &&
                strcmp(toname, "windows-1251"))
                tqmax = TQ_POOR;
        }
        return ((LYCharSet_UC[from].UChndl >= 0) ? tqmax : tqmin);
    }
}

/*
 *  Returns YES if no translation is necessary (because charsets
 *  are equal, are equivalent, etc.), NO otherwise.
 *
 *  `from' and `to' are indices into LYCharSet_UC.  Equal indices give
 *  YES even when both are negative; otherwise a negative index on
 *  either side gives NO, except that a 7-bit, "x-transparent" or
 *  "us-ascii" source gives YES before `to' is examined at all.
 */
PUBLIC BOOL UCNeedNotTranslate ARGS2(
    int,        from,
    int,        to)
{
    CONST char *fromname;
    CONST char *toname;

    if (from==to)
        return YES;
    if (from < 0)
        return NO;              /* ??? */
    if (LYCharSet_UC[from].enc == UCT_ENC_7BIT) {
        return YES;             /* only 7bit chars */
    }
    fromname = LYCharSet_UC[from].MIMEname;
    if (0==strcmp(fromname,"x-transparent") ||
        0==strcmp(fromname,"us-ascii")) {
        return YES;
    }
    if (to < 0)
        return NO;              /* ??? */
    if (to==0) {
        if (LYCharSet_UC[from].codepoints & (UCT_CP_SUBSETOF_LAT1))
            return YES;
    }
    toname = LYCharSet_UC[to].MIMEname;
    if (0==strcmp(toname,"x-transparent")) {
        return YES;
    }
    if (LYCharSet_UC[to].enc == UCT_ENC_UTF8) {
        return NO;
    }
    if (from==0) {
        /*
         *  NOTE(review): this tests the flags of `from', not of `to';
         *  by symmetry with the (to==0) case above one might expect
         *  LYCharSet_UC[to] here - confirm the intent before changing.
         */
        if (LYCharSet_UC[from].codepoints & (UCT_CP_SUPERSETOF_LAT1))
            return YES;
    }
    if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
        if (HTCJK == NOCJK)     /* use that global flag, for now */
            return NO;
        if (HTCJK == JAPANESE &&
            (0==strcmp(fromname,"euc-jp") ||
             0==strncmp(fromname,"iso-2022-jp",11) ||
             0==strcmp(fromname,"shift_jis")))
            return YES;         /* ??? */
        return NO;      /* if not handled by (from==to) above */
    }
    return NO;
}

/*
 *  The idea here is that any stage of the stream pipe which is interested
 *  in some charset dependent processing will call this function.
 *  Given input and output charsets, this function will set various flags
 *  in a UCTransParams structure that _suggest_ to the caller what to do.
 *
 *  Should be called once when a stage starts processing text (and the
 *  input and output charsets are known), or whenever one of input or
 *  output charsets has changed (e.g. by SGML.c stage after HTML.c stage
 *  has processed a META tag).
 *  The global flags (LYRawMode, HTPassEightBitRaw etc.) are currently
 *  not taken into account here (except for HTCJK, somewhat), it's still
 *  up to the caller to do something about them.
 *
 *  cs_in / cs_out are charset indices, p_in / p_out the corresponding
 *  charset descriptions; both pointers are dereferenced unconditionally.
 *
 *  Not every field of *pT is assigned on every path: the "transparent"
 *  branch leaves trans_to_uni and trans_from_uni untouched, and the CJK
 *  branch leaves strip_raw_char_in untouched.  Callers that need defined
 *  values there should call UCTransParams_clear() first.
 */
PUBLIC void UCSetTransParams ARGS5(
    UCTransParams *,    pT,
    int,                cs_in,
    CONST LYUCcharset*, p_in,
    int,                cs_out,
    CONST LYUCcharset*, p_out)
{
    pT->trans_C0_to_uni = FALSE;
    pT->transp = (!strcmp(p_in->MIMEname, "x-transparent") ||
                  !strcmp(p_out->MIMEname, "x-transparent"));
    if (pT->transp) {
        pT->do_cjk = FALSE;
        pT->decode_utf8 = FALSE;
        pT->output_utf8 = FALSE; /* we may, but won't know about it */
        pT->do_8bitraw = TRUE;
        pT->use_raw_char_in = TRUE;
        pT->strip_raw_char_in = FALSE;
        pT->pass_160_173_raw = TRUE;
        pT->repl_translated_C0 = (p_out->enc == UCT_ENC_8BIT_C0);
        pT->trans_C0_to_uni = (p_in->enc == UCT_ENC_8BIT_C0 ||
                               p_out->enc == UCT_ENC_8BIT_C0);
    } else {
        pT->do_cjk = ((p_in->enc == UCT_ENC_CJK) && (HTCJK != NOCJK));
        pT->decode_utf8 = (p_in->enc == UCT_ENC_UTF8);
        pT->output_utf8 = (p_out->enc == UCT_ENC_UTF8);
        if (pT->do_cjk) {
            pT->trans_to_uni = FALSE;
            pT->do_8bitraw = FALSE;
            pT->pass_160_173_raw = TRUE;
            pT->use_raw_char_in = FALSE; /* not used for CJK */
            pT->repl_translated_C0 = FALSE;
            pT->trans_from_uni = FALSE; /* not used for CJK */
        } else {
            /* Input can serve directly as the intermediate Unicode. */
            BOOL intm_ucs = (cs_in == 0 || pT->decode_utf8 ||
                             (p_in->codepoints &
                              (UCT_CP_SUBSETOF_LAT1|UCT_CP_SUBSETOF_UCS2)));
            BOOL use_ucs;

            pT->trans_to_uni = (!intm_ucs &&
                                UCCanUniTranslateFrom(cs_in));
            pT->trans_C0_to_uni = (pT->trans_to_uni &&
                                   p_in->enc == UCT_ENC_8BIT_C0);
            pT->repl_translated_C0 = (p_out->enc == UCT_ENC_8BIT_C0);
            /* High-bit stripping is only ever suggested for koi8-r input. */
            pT->strip_raw_char_in =
                ((!intm_ucs ||
                  (p_out->enc == UCT_ENC_7BIT) ||
                  (p_out->repertoire & UCT_REP_SUBSETOF_LAT1)) &&
                 cs_in != cs_out &&
                 !strcmp(p_in->MIMEname, "koi8-r"));
            use_ucs = (intm_ucs || pT->trans_to_uni);
            pT->do_8bitraw = (!use_ucs);
            pT->pass_160_173_raw = (!use_ucs &&
                                    !(p_in->like8859 & UCT_R_8859SPECL));
            pT->use_raw_char_in = (!pT->output_utf8 &&
                                   cs_in == cs_out &&
                                   !pT->trans_C0_to_uni);
            pT->trans_from_uni = (use_ucs && !pT->do_8bitraw &&
                                  !pT->use_raw_char_in &&
                                  UCCanTranslateUniTo(cs_out));
        }
    }
}

/*
 *  Reset every flag of *pT to FALSE.
 */
PUBLIC void UCTransParams_clear ARGS1(
    UCTransParams *,    pT)
{
    pT->transp = FALSE;
    pT->do_cjk = FALSE;
    pT->decode_utf8 = FALSE;
    pT->output_utf8 = FALSE;
    pT->do_8bitraw = FALSE;
    pT->use_raw_char_in = FALSE;
    pT->strip_raw_char_in = FALSE;
    pT->pass_160_173_raw = FALSE;
    pT->trans_to_uni = FALSE;
    pT->trans_C0_to_uni = FALSE;
    pT->repl_translated_C0 = FALSE;
    pT->trans_from_uni = FALSE;
}

/*
 *  If terminal is in UTF-8 mode, it probably cannot understand
 *  box drawing chars as (n)curses handles them.  (This may also
 *  be true for other display character sets, but isn't currently
 *  checked.)  In that case set the chars for hori and vert drawing
 *  chars to displayable ASCII chars if '0' was requested.  They'll
 *  stay as they are otherwise. - kw
 *
 *  cset is an index into LYCharSet_UC.  A negative cset is never used
 *  as an index; the requested characters are then passed through
 *  unchanged.
 */
PUBLIC void UCSetBoxChars ARGS5(
    int,        cset,
    int *,      pvert_out,
    int *,      phori_out,
    int,        vert_in,
    int,        hori_in)
{
    /*
     *  The lower bound must be 0, not -1: LYCharSet_UC[-1] would be
     *  an out-of-bounds read.
     */
    if (cset >= 0 && LYCharSet_UC[cset].enc == UCT_ENC_UTF8) {
        *pvert_out = (vert_in ? vert_in : '|');
        *phori_out = (hori_in ? hori_in : '-');
    } else {
        *pvert_out = vert_in;
        *phori_out = hori_in;
    }
}

/*
 *  Given an output target HTStream* (can also be a HTStructured* via
 *  typecast), the target stream's put_character method, and a unicode
 *  character, UCPutUtf8_charstring() will either output the UTF8
 *  encoding of the unicode and return YES, or do nothing and return
 *  NO (if conversion would be unnecessary or the unicode character is
 *  considered invalid).
 *
 *  Codes below 128 (including negative values) produce no output and
 *  return NO; codes up to 0x7fffffff are written as 2 to 6 bytes.
 *
 *  [Could be used more generally, but is currently only used for &#nnnnn
 *  stuff - generation of UTF8 from 8-bit encoded charsets not yet done
 *  by SGML.c etc.]
 */
/* PUTC emits a lead byte as given; PUTC2 emits a 10xxxxxx continuation
   byte carrying the low six bits of its argument. */
#define PUTC(ch) ((*myPutc)(target, (char)(ch)))
#define PUTC2(ch) ((*myPutc)(target,(char)(0x80|(0x3f &(ch)))))

PUBLIC BOOL UCPutUtf8_charstring ARGS3(
    HTStream *,         target,
    putc_func_t *,      myPutc,
    long,               code)
{
    if (code < 128)
        return NO;      /* indicate to caller we didn't handle it */
    else if (code < 0x800L) {
        PUTC(0xc0 | (code>>6));
        PUTC2(code);
    } else if (code < 0x10000L) {
        PUTC(0xe0 | (code>>12));
        PUTC2(code>>6);
        PUTC2(code);
    } else if (code < 0x200000L) {
        PUTC(0xf0 | (code>>18));
        PUTC2(code>>12);
        PUTC2(code>>6);
        PUTC2(code);
    } else if (code < 0x4000000L) {
        PUTC(0xf8 | (code>>24));
        PUTC2(code>>18);
        PUTC2(code>>12);
        PUTC2(code>>6);
        PUTC2(code);
    } else if (code <= 0x7fffffffL) {
        PUTC(0xfc | (code>>30));
        PUTC2(code>>24);
        PUTC2(code>>18);
        PUTC2(code>>12);
        PUTC2(code>>6);
        PUTC2(code);
    } else
        return NO;      /* only reachable where long is wider than 32 bits */
    return YES;
}