about summary refs log blame commit diff stats
path: root/src/UCAux.c
blob: e5d01147f281a4ce942859596a30ba4307fc50f6 (plain) (tree)
1
2
3
4
5
6
7
8
9
                    
 
                  
                  


                     
                       
 
                                    

                 
                  
                                

                                              
      

                                                              
 

                    
 
 
                                

               
                  




                                                            
                                                                      
 
 

                                   
 
                   
                   
                           
                  
                       
                                       
                                                             
                                           
     


                                                           

                                                 
                                         
                                                   
                       

                                                    
              






                                                                                
                                                    
                          

                                                 
                                               
                                                 
                           
              


                                                                              
                      
         
     
                                                

 
  




                                                    
 

                         
 
                   



                                                 
                                                      

                                           

                                             
                   


                                         
                       



                                                                   
                                           




                                               
                         



                                                                     
                                                                    
                      
                                
                                           
                                             

                                                                              



              
  
















                                                                          
 
                                                             

                                                                   
 
      


                                                                               
                                

      

                                                                           
                                                                    
                                                                    
 
                     
          

                                                        

                                
                                                                                   



                                      

                                                                        
                                                                     
            
          

                                        

                              
 
          

                                                                       
                                                                             
          

                                                                               

                                                              
                         
              


                                                        




                                        
                                                                            
                                           
                                                                            
                
              




                                                                               
                                                                    

                                                                                
              





                                                                            
                                                   
                                                                     
              



                                                                               
                                                             
                                                                        
              

                                                        
                                                                            
              


                                                            
                                          
              


                                                                           
                                                            
              




                                                                              
                                               
              


                                                                              
                                                      
                                                                                
              



                                                                             
                                                             

                                                                
              






                                                                               
                                                                      

                                                                      



         
  




                                                
 








                                  

                                   
                               
 
 
  






                                                                   
                                       



                                  
 
                    
                                                               
                                               
                                               


          



                             
 
  










                                                                         


                                                               
                                                                            


                                                                            

                                 
                    
                                 

                                  
                    
                                  


                                  

                                   



                                  
                    
                                     




                                  




                    

  








                                                    







                                          
                  


                        

                                                

                                 


                                                     

                                  



                                                      

                                   




                                                      

                   





                                                      



                   

  










                                                                       

                       
                     
                     



                                                                   










                                                             
                                             

                    
                                                 





                                    

                                      

              


                                              

              



                                                

              




                                                  

              





                                                    




                           
#include <HTUtils.h>

#include <HTCJK.h>
#include <UCMap.h>
#include <UCDefs.h>
#include <HTStream.h>
#include <UCAux.h>
#include <LYCharSets.h>

/*
 *  Tells whether text in the charset with handle `from' can be
 *  translated to Unicode.  Returns NO for a negative handle, for the
 *  "x-transparent" charset, and (unless EXP_JAPANESEUTF8_SUPPORT is
 *  defined) for charsets whose encoding is UCT_ENC_CJK; YES otherwise.
 */
BOOL UCCanUniTranslateFrom(int from)
{
    BOOL answer = YES;		/* everything not excluded below is YES */

    if (from < 0) {
	answer = NO;
    }
#ifndef EXP_JAPANESEUTF8_SUPPORT
    else if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
	answer = NO;
    }
#endif
    else if (strcmp(LYCharSet_UC[from].MIMEname, "x-transparent") == 0) {
	answer = NO;
    }

    return answer;
}

/*
 *  Tells whether Unicode can be translated to the charset with handle
 *  `to'.  Only a negative handle is refused; for every other charset at
 *  least some characters are assumed to be translatable.
 *
 *  A rejection of "x-transparent" as target was considered at some point
 *  but is deliberately not performed here.
 */
BOOL UCCanTranslateUniTo(int to)
{
    return (to < 0) ? NO : YES;
}

BOOL UCCanTranslateFromTo(int from,
			  int to)
{
    if (from == to)
	return YES;
    if (from < 0 || to < 0)
	return NO;
    if (from == LATIN1)
	return UCCanTranslateUniTo(to);
    if (to == LATIN1 || LYCharSet_UC[to].enc == UCT_ENC_UTF8)
	return UCCanUniTranslateFrom(from);
    {
	const char *fromname = LYCharSet_UC[from].MIMEname;
	const char *toname = LYCharSet_UC[to].MIMEname;

	if (!strcmp(fromname, "x-transparent") ||
	    !strcmp(toname, "x-transparent")) {
	    return YES;		/* ??? */
	} else if (!strcmp(fromname, "us-ascii")) {
	    return YES;
	}
	if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
	    /*
	     * CJK mode may be off (i.e., HTCJK == NOCJK) because the current
	     * document is not CJK, but the check may be for capability in
	     * relation to another document, for which CJK mode might be turned
	     * on when retrieved.  Thus, when the from charset is CJK, check if
	     * the to charset is CJK, and return NO or YES in relation to that. 
	     * - FM
	     */
	    if (LYCharSet_UC[to].enc != UCT_ENC_CJK)
		return NO;
	    if ((!strcmp(toname, "euc-jp") ||
		 !strcmp(toname, "shift_jis")) &&
		(!strcmp(fromname, "euc-jp") ||
		 !strcmp(fromname, "shift_jis")))
		return YES;
	    /*
	     * The euc-cn and euc-kr charsets were handled by the (from == to)
	     * above, so we need not check those.  - FM
	     */
	    return NO;
	}
    }
    return YES;			/* others YES */
}

/*
 *  Returns YES if no translation is necessary when going from charset
 *  `from' to charset `to' (because the charsets are equal, are
 *  equivalent, the input is 7-bit only, etc.), NO otherwise.
 *
 *  The checks are order dependent:
 *   - identical handles need no translation;
 *   - a negative `from' cannot be judged, so NO;
 *   - 7-bit, "x-transparent" or "us-ascii" input never needs translation;
 *   - a negative `to' cannot be judged, so NO;
 *   - a Latin-1 target needs no translation for input that is flagged
 *     as a subset of Latin-1;
 *   - an "x-transparent" target takes anything as is;
 *   - a UTF-8 target always needs translation;
 *   - Latin-1 input needs no translation for a target that is flagged
 *     as a superset of Latin-1;
 *   - CJK input is only passed untranslated for euc-jp / shift_jis
 *     while the global HTCJK mode is JAPANESE.
 */
BOOL UCNeedNotTranslate(int from,
			int to)
{
    const char *fromname;
    const char *toname;

    if (from == to)
	return YES;
    if (from < 0)
	return NO;		/* ??? */
    if (LYCharSet_UC[from].enc == UCT_ENC_7BIT) {
	return YES;		/* Only 7bit chars. */
    }
    fromname = LYCharSet_UC[from].MIMEname;
    if (!strcmp(fromname, "x-transparent") ||
	!strcmp(fromname, "us-ascii")) {
	return YES;
    }
    if (to < 0)
	return NO;		/* ??? */
    if (to == LATIN1) {
	if (LYCharSet_UC[from].codepoints & (UCT_CP_SUBSETOF_LAT1))
	    return YES;
    }
    toname = LYCharSet_UC[to].MIMEname;
    if (!strcmp(toname, "x-transparent")) {
	return YES;
    }
    if (LYCharSet_UC[to].enc == UCT_ENC_UTF8) {
	return NO;
    }
    if (from == LATIN1) {
	/*
	 * Mirror image of the Latin-1 target test above: what matters is
	 * whether the *target* covers all of Latin-1.  Testing the flags
	 * of `from' here would only inspect Latin-1 itself and thus not
	 * depend on the target at all.
	 */
	if (LYCharSet_UC[to].codepoints & (UCT_CP_SUPERSETOF_LAT1))
	    return YES;
    }
    if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
	if (HTCJK == NOCJK)	/* Use that global flag, for now. */
	    return NO;
	if (HTCJK == JAPANESE &&
	    (!strcmp(fromname, "euc-jp") ||
	     !strcmp(fromname, "shift_jis")))
	    return YES;		/* translate internally by lynx, no unicode */
	return NO;		/* If not handled by (from == to) above. */
    }
    return NO;
}

/*
 *  Any stage of the stream pipe that wants to do charset dependent
 *  processing calls this function.  From the input and output charsets
 *  it fills in the flags of a UCTransParams structure, which merely
 *  _suggest_ to the caller what to do.
 *
 *  Call it once when a stage starts processing text (input and output
 *  charsets being known), and again whenever one of the two charsets
 *  changes (e.g., by the SGML.c stage after the HTML.c stage has
 *  processed a META tag).
 *
 *  Apart from HTCJK, global flags (LYRawMode, HTPassEightBitRaw etc.)
 *  are not looked at here; honoring them remains the caller's job.
 *
 *  Three cases are distinguished, and each one writes only the flags
 *  listed in its section below:
 *    1. a "transparent" charset on either side,
 *    2. CJK input while CJK mode is on,
 *    3. everything else.
 */
void UCSetTransParams(UCTransParams * pT, int cs_in,
		      const LYUCcharset *p_in,
		      int cs_out,
		      const LYUCcharset *p_out)
{
    BOOL have_ucs;		/* input is Unicode, or as good as */
    BOOL want_ucs;		/* Unicode values exist or will be made */

    CTRACE((tfp, "UCSetTransParams: from %s(%d) to %s(%d)\n",
	    p_in->MIMEname, UCGetLYhndl_byMIME(p_in->MIMEname),
	    p_out->MIMEname, UCGetLYhndl_byMIME(p_out->MIMEname)));

    /*
     * Starts out FALSE in every case; cases 1 and 3 may turn it on
     * for charsets with the UCT_ENC_8BIT_C0 encoding.
     */
    pT->trans_C0_to_uni = FALSE;

    /*
     * "x-transparent" on either end acts as a "super raw mode".
     */
    pT->transp = (BOOL) (strcmp(p_in->MIMEname, "x-transparent") == 0 ||
			 strcmp(p_out->MIMEname, "x-transparent") == 0);

    /*
     * Case 1: transparent.  Everything is passed through raw.
     */
    if (pT->transp) {
	pT->do_8bitraw = TRUE;
	pT->use_raw_char_in = TRUE;
	pT->pass_160_173_raw = TRUE;
	pT->strip_raw_char_in = FALSE;
	pT->do_cjk = FALSE;
	pT->decode_utf8 = FALSE;
	pT->output_utf8 = FALSE;	/* may happen, but we cannot know */
	pT->repl_translated_C0 = (BOOL) (p_out->enc == UCT_ENC_8BIT_C0);
	pT->trans_C0_to_uni = (BOOL) (p_in->enc == UCT_ENC_8BIT_C0 ||
				      p_out->enc == UCT_ENC_8BIT_C0);
	return;
    }

    /*
     * Common to cases 2 and 3: is the input to be handled as CJK, and
     * is UTF-8 involved on the input or output side?
     */
    pT->do_cjk = (BOOL) ((p_in->enc == UCT_ENC_CJK) && (HTCJK != NOCJK));
    pT->decode_utf8 = (BOOL) (p_in->enc == UCT_ENC_UTF8);
    pT->output_utf8 = (BOOL) (p_out->enc == UCT_ENC_UTF8);

    /*
     * Case 2: CJK input with CJK mode on (HTCJK != NOCJK).  No Unicode
     * translation in either direction is suggested.
     */
    if (pT->do_cjk) {
	pT->trans_to_uni = FALSE;
	pT->trans_from_uni = FALSE;	/* not used for CJK */
	pT->use_raw_char_in = FALSE;	/* not used for CJK */
	pT->do_8bitraw = FALSE;
	pT->repl_translated_C0 = FALSE;
	pT->pass_160_173_raw = TRUE;
	return;
    }

    /*
     * Case 3: all other charset combinations.  The flags below build
     * on one another, so the order of the assignments matters.
     */

    /*
     * Unicode is available without any table lookup if the input is
     * iso-8859-1 or UTF-8, or its code points are flagged as a subset
     * of Latin-1 or of UCS-2.
     */
    have_ucs = (BOOL) (cs_in == LATIN1 || pT->decode_utf8 ||
		       (p_in->codepoints &
			(UCT_CP_SUBSETOF_LAT1 | UCT_CP_SUBSETOF_UCS2)));

    /*
     * Suggest a table translation to Unicode only if we do not have
     * Unicode already and such a translation is possible.  UTF-8 input
     * counts as "already Unicode" because users of this structure
     * decode it themselves.
     */
    pT->trans_to_uni = (BOOL) (!have_ucs &&
			       UCCanUniTranslateFrom(cs_in));

    /*
     * When translating to Unicode, octets in the control character
     * range have to be translated as well if the input charset uses
     * them for encoding (UCT_ENC_8BIT_C0).
     */
    pT->trans_C0_to_uni = (BOOL) (pT->trans_to_uni &&
				  p_in->enc == UCT_ENC_8BIT_C0);

    /*
     * Likewise flagged when the output charset is UCT_ENC_8BIT_C0.
     */
    pT->repl_translated_C0 = (BOOL) (p_out->enc == UCT_ENC_8BIT_C0);

    /*
     * Not used for any charset combination at present; always FALSE.
     */
    pT->strip_raw_char_in = FALSE;

    /*
     * Unicode values are at hand either directly or via translation.
     */
    want_ucs = (BOOL) (have_ucs || pT->trans_to_uni);

    /*
     * Without Unicode values the octets have to be handled raw.  This
     * complements HTPassEightBitRaw, which is maintained elsewhere.
     */
    pT->do_8bitraw = (BOOL) (!want_ucs);

    /*
     * Suggest not treating 160 and 173 as nbsp and shy when handling
     * raw octets of a charset that lacks the UCT_R_8859SPECL property.
     */
    pT->pass_160_173_raw = (BOOL) (!want_ucs &&
				   !(p_in->like8859 & UCT_R_8859SPECL));

    /*
     * Input octets can be used as they are if input and output charset
     * are the same and nothing forces them through Unicode anyway.
     */
    pT->use_raw_char_in = (BOOL) (!pT->output_utf8 &&
				  cs_in == cs_out &&
				  !pT->trans_C0_to_uni);

    /*
     * Suggest translating from Unicode to the output charset if Unicode
     * values are available, raw handling was not chosen above, and the
     * output charset can be translated to.  Users of the structure
     * additionally have to take HTPassEightBitRaw into account.
     */
    pT->trans_from_uni = (BOOL) (want_ucs && !pT->do_8bitraw &&
				 !pT->use_raw_char_in &&
				 UCCanTranslateUniTo(cs_out));
}

/*
 *  Resets a transformation structure: every flag that
 *  UCSetTransParams() may set is switched to FALSE here.
 */
void UCTransParams_clear(UCTransParams * pT)
{
    /* overall mode */
    pT->transp = FALSE;
    pT->do_cjk = FALSE;

    /* UTF-8 on input / output */
    pT->decode_utf8 = FALSE;
    pT->output_utf8 = FALSE;

    /* raw octet handling */
    pT->do_8bitraw = FALSE;
    pT->use_raw_char_in = FALSE;
    pT->strip_raw_char_in = FALSE;
    pT->pass_160_173_raw = FALSE;

    /* translation via Unicode */
    pT->trans_to_uni = FALSE;
    pT->trans_from_uni = FALSE;
    pT->trans_C0_to_uni = FALSE;
    pT->repl_translated_C0 = FALSE;
}

/*
 *  If terminal is in UTF-8 mode, it probably cannot understand
 *  box drawing chars as (n)curses handles them.  (This may also
 *  be true for other display character sets, but isn't currently
 *  checked.)  In that case set the chars for hori and vert drawing
 *  chars to displayable ASCII chars if '0' was requested.  They'll
 *  stay as they are otherwise. - kw
 *
 *  cset       charset handle of the display character set; a negative
 *             value is treated like "not UTF-8".
 *  pvert_out  receives the character to use for vertical lines.
 *  phori_out  receives the character to use for horizontal lines.
 *  vert_in    requested vertical line character, 0 for the default.
 *  hori_in    requested horizontal line character, 0 for the default.
 *
 *  When compiled with WIDEC_CURSES the inputs are always passed through
 *  unchanged and cset is not looked at.
 */
void UCSetBoxChars(int cset GCC_UNUSED,
		   int *pvert_out,
		   int *phori_out,
		   int vert_in,
		   int hori_in)
{
#ifndef WIDEC_CURSES
    /*
     * Only a non-negative handle may be used as index into LYCharSet_UC;
     * -1 in particular must not reach the table lookup.
     */
    if (cset >= 0 && LYCharSet_UC[cset].enc == UCT_ENC_UTF8) {
	*pvert_out = (vert_in ? vert_in : '|');
	*phori_out = (hori_in ? hori_in : '-');
    } else
#endif
    {
	*pvert_out = vert_in;
	*phori_out = hori_in;
    }
}

/*
 *  Writes the UTF-8 encoding of the Unicode value `code' to an output
 *  target, one byte at a time, through the target's put_character
 *  method `myPutc'.  `target' is an HTStream* (a HTStructured* may be
 *  passed via typecast).
 *
 *  Returns YES after the bytes have been output.  Returns NO without
 *  outputting anything if `code' is below 128 (the caller is expected
 *  to handle those itself) or above 0x7fffffff.
 *
 *  [Could be used more generally, but is currently only used for &#nnnnn
 *  stuff - generation of UTF8 from 8-bit encoded charsets not yet done
 *  by SGML.c etc.]
 */
#define PUTC(ch) ((*myPutc)(target, (char)(ch)))
#define PUTC2(ch) ((*myPutc)(target,(char)(0x80|(0x3f &(ch)))))

BOOL UCPutUtf8_charstring(HTStream *target, putc_func_t * myPutc, long code)
{
    int lead_bits;		/* marker bits of the first byte */
    int trailing;		/* number of continuation bytes */

    if (code < 128)
	return NO;		/* tell the caller we did not handle it */
    if (code > 0x7fffffffL)
	return NO;

    /*
     * Pick the sequence length from the magnitude of the value.
     */
    if (code < 0x800L) {
	lead_bits = 0xc0;
	trailing = 1;
    } else if (code < 0x10000L) {
	lead_bits = 0xe0;
	trailing = 2;
    } else if (code < 0x200000L) {
	lead_bits = 0xf0;
	trailing = 3;
    } else if (code < 0x4000000L) {
	lead_bits = 0xf8;
	trailing = 4;
    } else {
	lead_bits = 0xfc;
	trailing = 5;
    }

    /*
     * First byte carries the topmost payload bits, each following byte
     * the next lower group of six bits.
     */
    PUTC(lead_bits | (code >> (6 * trailing)));
    while (trailing-- > 0) {
	PUTC2(code >> (6 * trailing));
    }
    return YES;
}

/*
 *  This function converts a Unicode (UCode_t) value to its UTF-8
 *  encoding, which is stored NUL-terminated in the buffer received
 *  as an argument.
 *
 *  code    the value to encode; must be in the range 1..0x7fffffff.
 *  buffer  receives up to six encoding bytes plus the terminating
 *          NUL, so it has to hold at least seven characters (declaring
 *          it as 8 is suggested to minimize byte alignment problems
 *          with some compilers).
 *
 *  Returns YES on success.  Returns NO if buffer is NULL, or if code is
 *  out of range; in the latter case the buffer is set to an empty string.
 *
 *  Values below 0x80 are stored as a single byte: encoding them with a
 *  0xC0/0xC1 lead byte would yield an overlong, hence invalid, UTF-8
 *  sequence.
 */
BOOL UCConvertUniToUtf8(UCode_t code, char *buffer)
{
    char *ch = buffer;
    int lead;			/* marker bits of the first byte */
    int trail;			/* number of continuation bytes */

    if (!ch)
	return NO;

    if (code <= 0 || code > 0x7fffffffL) {
	*ch = '\0';
	return NO;
    }

    if (code < 0x80L) {
	/* ASCII range: the value is its own one-byte encoding */
	*ch++ = (char) code;
	*ch = '\0';
	return YES;
    }

    if (code < 0x800L) {
	lead = 0xc0;
	trail = 1;
    } else if (code < 0x10000L) {
	lead = 0xe0;
	trail = 2;
    } else if (code < 0x200000L) {
	lead = 0xf0;
	trail = 3;
    } else if (code < 0x4000000L) {
	lead = 0xf8;
	trail = 4;
    } else {
	lead = 0xfc;
	trail = 5;
    }

    /*
     * The first byte takes the topmost payload bits, every continuation
     * byte the next lower six bits.
     */
    *ch++ = (char) (lead | (code >> (6 * trail)));
    while (trail-- > 0) {
	*ch++ = (char) (0x80 | (0x3f & (code >> (6 * trail))));
    }
    *ch = '\0';
    return YES;
}

/*
 * Decode one character from a UTF-8 encoded string into its UCS code.
 *
 * On entry:
 *	*ppuni points to the first byte of the character to decode.
 * On exit:
 *	*ppuni points to the last byte of the UTF-8 sequence if a valid
 *		multibyte sequence was found; otherwise it is unchanged
 *		(this includes the single-byte ASCII case).
 * Returns the UCS value, or a negative value on error, i.e., if the
 * first byte cannot start a sequence or one of the expected
 * continuation bytes is not of the form 10xxxxxx.
 *
 * Sequences of up to six bytes are accepted.  Reading stops at the first
 * byte that is not a proper continuation byte.
 */
UCode_t UCGetUniFromUtf8String(char **ppuni)
{
    char *start = *ppuni;
    UCode_t value;
    int trail;			/* continuation bytes the lead byte announces */
    int k;

    if ((*start & 0x80) == 0)
	return (UCode_t) *start;	/* ASCII range character */
    if ((*start & 0x40) == 0)
	return (-1);		/* a continuation byte cannot start a char */

    /*
     * The lead byte determines the sequence length and contributes the
     * topmost payload bits.
     */
    if ((*start & 0xe0) == 0xc0) {
	trail = 1;
	value = (*start & 0x1f);
    } else if ((*start & 0xf0) == 0xe0) {
	trail = 2;
	value = (*start & 0x0f);
    } else if ((*start & 0xf8) == 0xf0) {
	trail = 3;
	value = (*start & 0x07);
    } else if ((*start & 0xfc) == 0xf8) {
	trail = 4;
	value = (*start & 0x03);
    } else if ((*start & 0xfe) == 0xfc) {
	trail = 5;
	value = (*start & 0x01);
    } else {			/* garbage */
	return (-1);
    }

    /*
     * Each continuation byte must look like 10xxxxxx and appends six
     * more bits.  *ppuni is not touched until all of them checked out.
     */
    for (k = 1; k <= trail; k++) {
	if ((start[k] & 0xc0) != 0x80)
	    return (-1);
	value = (value << 6) | (start[k] & 0x3f);
    }

    *ppuni = start + trail;
    return value;
}