src/TRSTable.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

#ifndef TRSTABLE_H
#define TRSTABLE_H

/* TRST_MAXCOLSPAN and TRST_MAXCOLSPAN are defined in userdefs.h */

typedef struct _STable_info STable_info;
extern STable_info * Stbl_startTABLE PARAMS((short));
extern int Stbl_finishTABLE PARAMS((STable_info *));
extern void Stbl_free PARAMS((STable_info *));
extern int Stbl_addRowToTable PARAMS((STable_info *, short, int));
extern int Stbl_addCellToTable PARAMS((STable_info *, int, int, short, BOOL, int, int));
extern int Stbl_finishCellInTable PARAMS((STable_info *, BOOL, int, int));
extern int Stbl_addColInfo PARAMS((STable_info *, int, short, BOOL));
extern int Stbl_finishColGroup PARAMS((STable_info *));
extern int Stbl_addRowGroup PARAMS((STable_info *, short));
#define Stbl_lineBreak(stbl,l,pos) Stbl_finishCellInTable(stbl, NO, l, pos)
extern int Stbl_getStartLine PARAMS((STable_info *));
extern int Stbl_getFixupPositions PARAMS((
    STable_info *	me,
    int			lineno,
    int *		oldpos,
    int *		newpos));
extern short Stbl_getAlignment PARAMS((STable_info *));

#endif /* TRSTABLE_H */
#include <HTUtils.h>

#include <HTCJK.h>
#include <UCMap.h>
#include <UCDefs.h>
#include <HTStream.h>
#include <UCAux.h>

extern HTCJKlang HTCJK;
extern LYUCcharset LYCharSet_UC[];

PUBLIC BOOL UCCanUniTranslateFrom ARGS1(
	int,		from)
{
    if (from < 0)
	return NO;
    if (LYCharSet_UC[from].enc == UCT_ENC_CJK)
	return NO;
    if (!strcmp(LYCharSet_UC[from].MIMEname, "x-transparent"))
	return NO;

    /* others YES */
    return YES;
}

PUBLIC BOOL UCCanTranslateUniTo ARGS1(
	int,		to)
{
    if (to < 0)
	return NO;
/*???
    if (!strcmp(LYCharSet_UC[to].MIMEname, "x-transparent"))
       return NO;
*/

    return YES;			/* well at least some characters... */
}

PUBLIC BOOL UCCanTranslateFromTo ARGS2(
	int,		from,
	int,		to)
{
    if (from == to)
	return YES;
    if (from < 0 || to < 0)
	return NO;
    if (from == LATIN1)
	return UCCanTranslateUniTo(to);
    if (to == LATIN1 || LYCharSet_UC[to].enc == UCT_ENC_UTF8)
	return UCCanUniTranslateFrom(from);
    {
	CONST char * fromname = LYCharSet_UC[from].MIMEname;
	CONST char * toname = LYCharSet_UC[to].MIMEname;
	if (!strcmp(fromname, "x-transparent") ||
	    !strcmp(toname, "x-transparent")) {
	    return YES; /* ??? */
	} else if (!strcmp(fromname, "us-ascii")) {
	    return YES;
	}
	if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
	    /*
	    **  CJK mode may be off (i.e., HTCJK == NOCJK) because
	    **  the current document is not CJK, but the check may
	    **  be for capability in relation to another document,
	    **  for which CJK mode might be turned on when retrieved.
	    **  Thus, when the from charset is CJK, check if the to
	    **  charset is CJK, and return NO or YES in relation to
	    **  that. - FM
	    */
	    if (LYCharSet_UC[to].enc != UCT_ENC_CJK)
		return NO;
	    if ((!strcmp(toname, "euc-jp") ||
		 !strcmp(toname, "shift_jis")) &&
		(!strcmp(fromname, "euc-jp") ||
		 !strcmp(fromname, "shift_jis")))
		return YES;
	    /*
	    **  The euc-cn and euc-kr charsets were handled
	    **  by the (from == to) above, so we need not
	    **  check those. - FM
	    **/
	    return NO;
	}
    }
    return YES;    /* others YES */
}

/*
**  Returns YES if no translation necessary (because
**  charsets are equal, are equivalent, etc.).
*/
PUBLIC BOOL UCNeedNotTranslate ARGS2(
	int,		from,
	int,		to)
{
    CONST char *fromname;
    CONST char *toname;
    if (from == to)
	return YES;
    if (from < 0)
	return NO;		/* ??? */
    if (LYCharSet_UC[from].enc == UCT_ENC_7BIT) {
	return YES;		/* Only 7bit chars. */
    }
    fromname = LYCharSet_UC[from].MIMEname;
    if (!strcmp(fromname, "x-transparent") ||
	!strcmp(fromname, "us-ascii")) {
	    return YES;
    }
    if (to < 0)
	return NO;		/* ??? */
    if (to == LATIN1) {
	if (LYCharSet_UC[from].codepoints & (UCT_CP_SUBSETOF_LAT1))
	    return YES;
    }
    toname = LYCharSet_UC[to].MIMEname;
    if (!strcmp(toname, "x-transparent")) {
	return YES;
    }
    if (LYCharSet_UC[to].enc == UCT_ENC_UTF8) {
	return NO;
    }
    if (from == LATIN1) {
	if (LYCharSet_UC[from].codepoints & (UCT_CP_SUPERSETOF_LAT1))
	    return YES;
    }
    if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
	if (HTCJK == NOCJK)	/* Use that global flag, for now. */
	    return NO;
	if (HTCJK == JAPANESE &&
	    (!strcmp(fromname, "euc-jp") ||
	     !strcmp(fromname, "shift_jis")))
	    return YES;	/* translate internally by lynx, no unicode */
	return NO;	/* If not handled by (from == to) above. */
    }
    return NO;
}

/*
**  The idea here is that any stage of the stream pipe which is interested
**  in some charset dependent processing will call this function.
**  Given input and output charsets, this function will set various flags
**  in a UCTransParams structure that _suggest_ to the caller what to do.
**
**  Should be called once when a stage starts processing text (and the
**  input and output charsets are known), or whenever one of input or
**  output charsets has changed (e.g., by SGML.c stage after HTML.c stage
**  has processed a META tag).
**  The global flags (LYRawMode, HTPassEightBitRaw etc.) are currently
**  not taken into account here (except for HTCJK, somewhat), it's still
**  up to the caller to do something about them. - KW
*/
PUBLIC void UCSetTransParams ARGS5(
    UCTransParams *,	pT,
    int,		cs_in,
    CONST LYUCcharset*,	p_in,
    int,		cs_out,
    CONST LYUCcharset*,	p_out)
{
    CTRACE((tfp, "UCSetTransParams: from %s(%d) to %s(%d)\n",
	   p_in->MIMEname,  UCGetLYhndl_byMIME(p_in->MIMEname),
	   p_out->MIMEname, UCGetLYhndl_byMIME(p_out->MIMEname)));

    /*
    **  Initialize this element to FALSE, and set it TRUE
    **  below if we're dealing with VISCII. - FM
    */
    pT->trans_C0_to_uni = FALSE;

    /*
    **  The "transparent" display character set is a
    **  "super raw mode". - FM
    */
    pT->transp = (BOOL) (!strcmp(p_in->MIMEname, "x-transparent") ||
		  !strcmp(p_out->MIMEname, "x-transparent"));

    if (pT->transp) {
	/*
	**  Set up the structure for "transparent". - FM
	*/
	pT->do_cjk = FALSE;
	pT->decode_utf8 = FALSE;
	pT->output_utf8 = FALSE;  /* We may, but won't know about it. - KW */
	pT->do_8bitraw = TRUE;
	pT->use_raw_char_in = TRUE;
	pT->strip_raw_char_in = FALSE;
	pT->pass_160_173_raw = TRUE;
	pT->repl_translated_C0 = (BOOL) (p_out->enc == UCT_ENC_8BIT_C0);
	pT->trans_C0_to_uni = (BOOL) (p_in->enc == UCT_ENC_8BIT_C0 ||
			       p_out->enc == UCT_ENC_8BIT_C0);
    } else {
	/*
	**  Initialize local flags. - FM
	*/
	BOOL intm_ucs = FALSE;
	BOOL use_ucs = FALSE;
	/*
	**  Set this element if we want to treat
	**  the input as CJK. - FM
	*/
	pT->do_cjk = (BOOL) ((p_in->enc == UCT_ENC_CJK) && (HTCJK != NOCJK));
	/*
	**  Set these elements based on whether
	**  we are dealing with UTF-8. - FM
	*/
	pT->decode_utf8 = (BOOL) (p_in->enc == UCT_ENC_UTF8);
	pT->output_utf8 = (BOOL) (p_out->enc == UCT_ENC_UTF8);
	if (pT->do_cjk) {
	    /*
	    **  Set up the structure for a CJK input with
	    **  a CJK output (HTCJK != NOCJK). - FM
	    */
	    intm_ucs = FALSE;
	    pT->trans_to_uni = FALSE;
	    use_ucs = FALSE;
	    pT->do_8bitraw = FALSE;
	    pT->pass_160_173_raw = TRUE;
	    pT->use_raw_char_in = FALSE; /* Not used for CJK. - KW */
	    pT->repl_translated_C0 = FALSE;
	    pT->trans_from_uni = FALSE;	 /* Not used for CJK. - KW */
	} else {
	    /*
	    **  Set up for all other charset combinations.
	    **  The intm_ucs flag is set TRUE if the input
	    **  charset is iso-8859-1 or UTF-8, or largely
	    **  equivalent to them, i.e., if we have UCS without
	    **  having to do a table translation.
	    */
	    intm_ucs = (BOOL) (cs_in == LATIN1 || pT->decode_utf8 ||
			(p_in->codepoints &
			 (UCT_CP_SUBSETOF_LAT1|UCT_CP_SUBSETOF_UCS2)));
	    /*
	    **  pT->trans_to_uni is set TRUE if we do not have that as
	    **  input already, and we can translate to Unicode.  Note
	    **  that UTF-8 always is converted to Unicode in functions
	    **  that use the transformation structure, so it is
	    **  treated as already Unicode here.
	    */
	    pT->trans_to_uni = (BOOL) (!intm_ucs &&
				UCCanUniTranslateFrom(cs_in));
	    /*
	    **  We set this if we are translating to Unicode and
	    **  what normally are low value control characters in
	    **  fact are encoding octets for the input charset
	    **  (presently, this applies to VISCII). - FM
	    */
	    pT->trans_C0_to_uni = (BOOL) (pT->trans_to_uni &&
				   p_in->enc == UCT_ENC_8BIT_C0);
	    /*
	    **  We set this, presently, for VISCII. - FM
	    */
	    pT->repl_translated_C0 = (BOOL) (p_out->enc == UCT_ENC_8BIT_C0);
	    /*
	    **  Currently unused for any charset combination.
	    **  Should always be FALSE
	    */
	    pT->strip_raw_char_in = FALSE;
	    /*
	    **  use_ucs should be set TRUE if we have or will create
	    **  Unicode values for input octets or UTF multibytes. - FM
	    */
	    use_ucs = (BOOL) (intm_ucs || pT->trans_to_uni);
	    /*
	    **  This is set TRUE if use_ucs was set FALSE.  It is
	    **  complementary to the HTPassEightBitRaw flag, which
	    **  is set TRUE or FALSE elsewhere based on the raw mode
	    **  setting in relation to the current Display Character
	    **  Set. - FM
	    */
	    pT->do_8bitraw = (BOOL) (!use_ucs);
	    /*
	    **  This is set TRUE when 160 and 173 should not be
	    **  treated as nbsp and shy, respectively. - FM
	    */
	    pT->pass_160_173_raw = (BOOL) (!use_ucs &&
				    !(p_in->like8859 & UCT_R_8859SPECL));
	    /*
	    **  This is set when the input and output charsets match,
	    **  and they are not ones which should go through a Unicode
	    **  translation process anyway. - FM
	    */
	    pT->use_raw_char_in = (BOOL) (!pT->output_utf8 &&
				   cs_in == cs_out &&
				   !pT->trans_C0_to_uni);
	    /*
	    **  This should be set TRUE when we expect to have
	    **  done translation to Unicode or had the equivalent
	    **  as input, can translate it to our output charset,
	    **  and normally want to do so.  The latter depends on
	    **  the pT->do_8bitraw and pT->use_raw_char_in values set
	    **  above, but also on HTPassEightBitRaw in any functions
	    **  which use the transformation structure.. - FM
	    */
	    pT->trans_from_uni = (BOOL) (use_ucs && !pT->do_8bitraw &&
				  !pT->use_raw_char_in &&
				  UCCanTranslateUniTo(cs_out));
	}
    }
}

/*
**  This function initializes the transformation
**  structure by setting all its elements to
**  FALSE. - KW
*/
PUBLIC void UCTransParams_clear ARGS1(
    UCTransParams *,    pT)
{
    pT->transp = FALSE;
    pT->do_cjk = FALSE;
    pT->decode_utf8 = FALSE;
    pT->output_utf8 = FALSE;
    pT->do_8bitraw = FALSE;
    pT->use_raw_char_in = FALSE;
    pT->strip_raw_char_in = FALSE;
    pT->pass_160_173_raw = FALSE;
    pT->trans_to_uni = FALSE;
    pT->trans_C0_to_uni = FALSE;
    pT->repl_translated_C0 = FALSE;
    pT->trans_from_uni = FALSE;
}

/*
**  If terminal is in UTF-8 mode, it probably cannot understand
**  box drawing chars as (n)curses handles them.  (This may also
**  be true for other display character sets, but isn't currently
**  checked.)  In that case set the chars for hori and vert drawing
**  chars to displayable ASCII chars if '0' was requested.  They'll
**  stay as they are otherwise. - kw
*/
PUBLIC void UCSetBoxChars ARGS5(
    int,	cset,
    int *,	pvert_out,
    int *,	phori_out,
    int,	vert_in,
    int,	hori_in)
{
    if (cset >= -1 && LYCharSet_UC[cset].enc == UCT_ENC_UTF8) {
	*pvert_out = (vert_in ? vert_in : '|');
	*phori_out = (hori_in ? hori_in : '-');
    } else {
	*pvert_out = vert_in;
	*phori_out = hori_in;
    }
}

/*
**  Given an output target HTStream* (can also be a HTStructured* via
**  typecast), the target stream's put_character method, and a Unicode
**  character,  CPutUtf8_charstring() will either output the UTF8
**  encoding of the Unicode and return YES, or do nothing and return
**  NO (if conversion would be unnecessary or the Unicode character is
**  considered invalid).
**
**  [Could be used more generally, but is currently only used for &#nnnnn
**  stuff - generation of UTF8 from 8-bit encoded charsets not yet done
**  by SGML.c etc.]
*/
#define PUTC(ch) ((*myPutc)(target, (char)(ch)))
#define PUTC2(ch) ((*myPutc)(target,(char)(0x80|(0x3f &(ch)))))

PUBLIC BOOL UCPutUtf8_charstring ARGS3(
	HTStream *,	target,
	putc_func_t *,	myPutc,
	long,		code)
{
    if (code < 128)
	return NO;		/* indicate to caller we didn't handle it */
    else if   (code < 0x800L) {
	PUTC(0xc0 | (code>>6));
	PUTC2(code);
    } else if (code < 0x10000L) {
	PUTC(0xe0 | (code>>12));
	PUTC2(code>>6);
	PUTC2(code);
    } else if (code < 0x200000L) {
	PUTC(0xf0 | (code>>18));
	PUTC2(code>>12);
	PUTC2(code>>6);
	PUTC2(code);
    } else if (code < 0x4000000L) {
	PUTC(0xf8 | (code>>24));
	PUTC2(code>>18);
	PUTC2(code>>12);
	PUTC2(code>>6);
	PUTC2(code);
    } else if (code <= 0x7fffffffL) {
	PUTC(0xfc | (code>>30));
	PUTC2(code>>24);
	PUTC2(code>>18);
	PUTC2(code>>12);
	PUTC2(code>>6);
	PUTC2(code);
    } else
	return NO;
    return YES;
}

/*
**  This function converts a Unicode (UCode_t) value
**  to a multibyte UTF-8 character, which is loaded
**  into the buffer received as an argument.  The
**  buffer should be large enough to hold at least
**  seven characters (but should be declared as 8
**  to minimize byte alignment problems with some
**  compilers). - FM
*/
PUBLIC BOOL UCConvertUniToUtf8 ARGS2(
	UCode_t,	code,
	char *,		buffer)
{
    char *ch = buffer;

    if (!ch)
	return NO;

    if (code <= 0 || code > 0x7fffffffL) {
	*ch = '\0';
	return NO;
    }

    if (code < 0x800L) {
	*ch++ = (char)(0xc0 | (code>>6));
	*ch++ = (char)(0x80 | (0x3f & (code)));
	*ch = '\0';
    } else if (code < 0x10000L) {
	*ch++ = (char)(0xe0 | (code>>12));
	*ch++ = (char)(0x80 | (0x3f & (code>>6)));
	*ch++ = (char)(0x80 | (0x3f & (code)));
	*ch = '\0';
    } else if (code < 0x200000L) {
	*ch++ = (char)(0xf0 | (code>>18));
	*ch++ = (char)(0x80 | (0x3f & (code>>12)));
	*ch++ = (char)(0x80 | (0x3f & (code>>6)));
	*ch++ = (char)(0x80 | (0x3f & (code)));
	*ch = '\0';
    } else if (code < 0x4000000L) {
	*ch++ = (char)(0xf8 | (code>>24));
	*ch++ = (char)(0x80 | (0x3f & (code>>18)));
	*ch++ = (char)(0x80 | (0x3f & (code>>12)));
	*ch++ = (char)(0x80 | (0x3f & (code>>6)));
	*ch++ = (char)(0x80 | (0x3f & (code)));
	*ch = '\0';
    } else {
	*ch++ = (char)(0xfc | (code>>30));
	*ch++ = (char)(0x80 | (0x3f & (code>>24)));
	*ch++ = (char)(0x80 | (0x3f & (code>>18)));
	*ch++ = (char)(0x80 | (0x3f & (code>>12)));
	*ch++ = (char)(0x80 | (0x3f & (code>>6)));
	*ch++ = (char)(0x80 | (0x3f & (code)));
	*ch = '\0';
    }
    return YES;
}

/*
** Get UCS character code for one character from UTF-8 encoded string.
**
** On entry:
**	*ppuni should point to beginning of UTF-8 encoding character
** On exit:
**	*ppuni is advanced to point to the last byte of UTF-8 sequence,
**		if there was a valid one; otherwise unchanged.
** returns the UCS value
** returns negative value on error (invalid UTF-8 sequence)
*/
PUBLIC UCode_t UCGetUniFromUtf8String ARGS1(char **, ppuni)
{
    UCode_t uc_out = 0;
    char * p = *ppuni;
    int utf_count, i;
    if (!(**ppuni&0x80))
	return (UCode_t) **ppuni; /* ASCII range character */
    else if (!(**ppuni&0x40))
	return (-1);		/* not a valid UTF-8 start */
    if ((*p & 0xe0) == 0xc0) {
	utf_count = 1;
    } else if ((*p & 0xf0) == 0xe0) {
	utf_count = 2;
    } else if ((*p & 0xf8) == 0xf0) {
	utf_count = 3;
    } else if ((*p & 0xfc) == 0xf8) {
	utf_count = 4;
    } else if ((*p & 0xfe) == 0xfc) {
	utf_count = 5;
    } else { /* garbage */
	return (-1);
    }
    for (p = *ppuni, i = 0; i < utf_count ; i++) {
	if ((*(++p) & 0xc0) != 0x80)
	    return (-1);
    }
    p = *ppuni;
    switch (utf_count) {
    case 1:
	uc_out = (((*p&0x1f) << 6) | (*(p+1)&0x3f));
	break;
    case 2:
	uc_out = (((((*p&0x0f) << 6) | (*(p+1)&0x3f)) << 6) | (*(p+2)&0x3f));
	break;
    case 3:
	uc_out = (((((((*p&0x07) << 6) | (*(p+1)&0x3f)) << 6) | (*(p+2)&0x3f)) << 6)
	    | (*(p+3)&0x3f));
	break;
    case 4:
	uc_out = (((((((((*p&0x03) << 6) | (*(p+1)&0x3f)) << 6) | (*(p+2)&0x3f)) << 6)
		  | (*(p+3)&0x3f)) << 6) | (*(p+4)&0x3f));
	break;
    case 5:
	uc_out = (((((((((((*p&0x01) << 6) | (*(p+1)&0x3f)) << 6) | (*(p+2)&0x3f)) << 6)
		  | (*(p+3)&0x3f)) << 6) | (*(p+4)&0x3f)) << 6) | (*(p+5)&0x3f));
	break;
    }
    *ppuni = p + utf_count;
    return uc_out;
}