/*			General SGML Parser code		SGML.c
**			========================
**
**	This module implements an HTStream object.  To parse an
**	SGML file, create this object which is a parser.  The object
**	is (currently) created by being passed a DTD structure,
**	and a target HTStructured object at which to throw the parsed stuff.
**
**	 6 Feb 93  Binary searches used.  Interface modified.
*/

/*
 * NOTE(review): every #include directive in this region is missing its
 * header name (it appears to have been stripped when the text was
 * extracted).  Restore the names from the project's original source
 * before building; they are left exactly as found here.
 */
#include

/* Remove the following to disable the experimental HTML DTD parsing.
   Currently only used in this source file. - kw */

#ifndef NO_EXTENDED_HTMLDTD
#define EXTENDED_HTMLDTD
#endif

#include
#include
#include
#include
#include
#include
#include
#include
#include  /* S/390 -- gil -- 0635 */
#include
#include

#ifdef USE_COLOR_STYLE
# include
#endif
#ifdef USE_PRETTYSRC
# include
#endif

#if 0
#ifdef CJK_EX	/* 1997/12/12 (Fri) 16:54:58 */
extern HTkcode last_kcode;
#endif
#endif

/* Value of current_attribute_number when no attribute is selected. */
#define INVALID (-1)

/*
 * Pretty-source helpers.
 *   PSRC(x)       - run x only when psrc_view is set
 *   NPSRC(x)      - run x only when psrc_view is clear
 *   IFDEFPSRC(x)  - x only when built with USE_PRETTYSRC
 *   IFNDEFPSRC(x) - x only when built without USE_PRETTYSRC
 * PSRC/NPSRC expand to an "if (...) { ... };" statement, so they must not
 * be used as the unbraced body of an if that has an else.
 */
#ifdef USE_PRETTYSRC
# define PSRC(x) if (psrc_view) { x };
# define NPSRC(x) if (!psrc_view) { x };
# define IFDEFPSRC(x) x
# define IFNDEFPSRC(x)
char* entity_string; /* this is used for printing entity name.
    Unconditionally added since redundant assignments don't hurt much*/

/*
 * put_character-shaped callback that discards its arguments.
 * handle_entity() passes it to UCPutUtf8_charstring() in pretty-source
 * view so that the call produces no output.
 */
PRIVATE void fake_put_character ARGS2(
	void*,		p GCC_UNUSED,
	char,		c GCC_UNUSED)
{
}

/* Values for the 'start' argument of HTMLSRC_apply_markup(). */
#define START TRUE
#define STOP FALSE

/* Two statements: do not use as the unbraced body of an if/else. */
#define PUTS_TR(x) psrc_convert_string = TRUE; PUTS(x)

#else
# define PSRC(x)
# define NPSRC(x)
# define IFDEFPSRC(x)
# define IFNDEFPSRC(x) x
#endif

/* will use an inlined version */
/*
 * The inlined HTChunkPutc grows the chunk by ch->growby when it is full,
 * calls outofmem() if the allocation fails, then appends c.  It expands to
 * several statements and evaluates 'ch' more than once, so pass a plain
 * pointer variable and do not use it as an unbraced if/else body.
 */
#ifdef USE_INLINE_PUTC
#undef HTChunkPutc
#define HTChunkPutc(ch,c)\
    if (ch->size >= ch->allocated) {\
	ch->allocated = ch->allocated + ch->growby;\
	ch->data = ch->data ? (char *)realloc(ch->data, ch->allocated)\
			    : (char *)calloc(1, ch->allocated);\
      if (!ch->data)\
	  outofmem(__FILE__, "HTChunkPutc");\
    }\
    ch->data[ch->size++] = c;
#endif

/* Send a string to the target; relies on a local named 'context'. */
#define PUTS(str) ((*context->actions->put_string)(context->target, str))

#define OPT 1

/*the following macros are used for pretty source view.
*/ #define IS_C(attr) (attr.type == HTMLA_CLASS) PUBLIC HTCJKlang HTCJK = NOCJK; /* CJK enum value. */ PUBLIC BOOL HTPassEightBitRaw = FALSE; /* Pass 161-172,174-255 raw. */ PUBLIC BOOL HTPassEightBitNum = FALSE; /* Pass ^ numeric entities raw. */ PUBLIC BOOL HTPassHighCtrlRaw = FALSE; /* Pass 127-160,173, raw. */ PUBLIC BOOL HTPassHighCtrlNum = FALSE; /* Pass €-Ÿ raw. */ /* extern int LYlowest_eightbit[]; for completeness here */ /* The State (context) of the parser ** ** This is passed with each call to make the parser reentrant ** */ #define MAX_ATTRIBUTES 36 /* Max number of attributes per element */ /* Element Stack ** ------------- ** This allows us to return down the stack reselecting styles. ** As we return, attribute values will be garbage in general. */ typedef struct _HTElement HTElement; struct _HTElement { HTElement * next; /* Previously nested element or 0 */ HTTag* tag; /* The tag at this level */ }; /* Internal Context Data Structure ** ------------------------------- */ struct _HTStream { CONST HTStreamClass * isa; /* inherited from HTStream */ CONST SGML_dtd *dtd; CONST HTStructuredClass *actions; /* target class */ HTStructured *target; /* target object */ HTTag *current_tag; HTTag *slashedtag; CONST HTTag *unknown_tag; BOOL inSELECT; BOOL no_lynx_specialcodes; int current_attribute_number; HTChunk *string; HTElement *element_stack; enum sgml_state { S_text, S_tagname_slash, S_pcdata, S_litteral, S_tag, S_tag_gap, S_attr, S_attr_gap, S_equals, S_value, S_ero, S_cro, S_incro, S_exclamation, S_comment, S_doctype, S_marked, S_sgmlent, S_sgmlele, S_sgmlatt, S_squoted, S_dquoted, S_end, S_entity, S_esc, S_dollar, S_paren, S_nonascii_text, S_dollar_paren, S_esc_sq, S_dollar_sq, S_paren_sq, S_nonascii_text_sq, S_dollar_paren_sq, S_esc_dq, S_dollar_dq, S_paren_dq, S_nonascii_text_dq, S_dollar_paren_dq, S_in_kanji, S_junk_tag, S_junk_pi} state; unsigned char kanji_buf; #ifdef CALLERDATA void * callerData; #endif /* CALLERDATA */ BOOL 
present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */ char * value[MAX_ATTRIBUTES]; /* malloc'd strings or NULL if none */ BOOL lead_exclamation; BOOL first_dash; BOOL end_comment; BOOL doctype_bracket; BOOL first_bracket; BOOL second_bracket; BOOL isHex; HTParentAnchor * node_anchor; LYUCcharset * inUCI; /* pointer to anchor UCInfo */ int inUCLYhndl; /* charset we are fed */ LYUCcharset * outUCI; /* anchor UCInfo for target */ int outUCLYhndl; /* charset for target */ char utf_count; UCode_t utf_char; char utf_buf[8]; char * utf_buf_p; UCTransParams T; int current_tag_charset; /* charset to pass attributes */ char * recover; int recover_index; char * include; char * active_include; int include_index; char * url; char * csi; int csi_index; #ifdef USE_PRETTYSRC BOOL cur_attr_is_href; BOOL cur_attr_is_name; BOOL seen_nonwhite_in_junk_tag; #endif }; #ifdef USE_PRETTYSRC PRIVATE void HTMLSRC_apply_markup ARGS3( HTStream *, context, HTlexeme, lexeme, BOOL, start) { HT_tagspec* ts = *( ( start ? lexeme_start : lexeme_end ) + lexeme); while (ts) { #ifdef USE_COLOR_STYLE if (ts->start) { current_tag_style = ts->style; force_current_tag_style = TRUE; forced_classname = ts->class_name; force_classname = TRUE; } #endif CTRACE((tfp,ts->start ? 
"SRCSTART %d\n" : "SRCSTOP %d\n",(int)lexeme)); if (ts->start) (*context->actions->start_element)( context->target, ts->element, ts->present, (CONST char **)ts->value, context->current_tag_charset, (char **)&context->include); else (*context->actions->end_element)( context->target, ts->element, (char **)&context->include); ts = ts->next; } } #if ANSI_PREPRO # define PSRCSTART(x) HTMLSRC_apply_markup(context,HTL_##x,START) # define PSRCSTOP(x) HTMLSRC_apply_markup(context,HTL_##x,STOP) #else # define PSRCSTART(x) HTMLSRC_apply_markup(context,HTL_/**/x,START) # define PSRCSTOP(x) HTMLSRC_apply_markup(context,HTL_/**/x,STOP) #endif #define attr_is_href context->cur_attr_is_href #define attr_is_name context->cur_attr_is_name #endif PRIVATE void set_chartrans_handling ARGS3( HTStream *, context, HTParentAnchor *, anchor, int, chndl) { if (chndl < 0) { /* ** Nothing was set for the parser in earlier stages, ** so the HTML parser's UCLYhndl should still be its ** default. - FM */ chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_STRUCTURED); if (chndl < 0) /* ** That wasn't set either, so seek the HText default. - FM */ chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT); if (chndl < 0) /* ** That wasn't set either, so assume the current display ** character set. - FM */ chndl = current_char_set; /* ** Try to set the HText and HTML stages' chartrans info ** with the default lock level (will not be changed if ** it was set previously with a higher lock level). - FM */ HTAnchor_setUCInfoStage(anchor, chndl, UCT_STAGE_HTEXT, UCT_SETBY_DEFAULT); HTAnchor_setUCInfoStage(anchor, chndl, UCT_STAGE_STRUCTURED, UCT_SETBY_DEFAULT); /* ** Get the chartrans info for output to the HTML parser. - FM */ context->outUCI = HTAnchor_getUCInfoStage(anchor, UCT_STAGE_STRUCTURED); context->outUCLYhndl = HTAnchor_getUCLYhndl(context->node_anchor, UCT_STAGE_STRUCTURED); } /* ** Set the in->out transformation parameters. 
- FM */ UCSetTransParams(&context->T, context->inUCLYhndl, context->inUCI, context->outUCLYhndl, context->outUCI); /* ** This is intended for passing the SGML parser's input ** charset as an argument in each call to the HTML ** parser's start tag function, but it would be better ** to call a Lynx_HTML_parser function to set an element ** in its HTStructured object, itself, if this were ** needed. - FM */ if (HTCJK != NOCJK) { context->current_tag_charset = -1; } else if (context->T.transp) { context->current_tag_charset = context->inUCLYhndl; } else if (context->T.decode_utf8) { context->current_tag_charset = context->inUCLYhndl; } else if (context->T.do_8bitraw || context->T.use_raw_char_in) { context->current_tag_charset = context->inUCLYhndl; } else if (context->T.output_utf8 || context->T.trans_from_uni) { context->current_tag_charset = UCGetLYhndl_byMIME("utf-8"); } else { context->current_tag_charset = LATIN1; } } PRIVATE void change_chartrans_handling ARGS1( HTStream *, context) { int new_LYhndl = HTAnchor_getUCLYhndl(context->node_anchor, UCT_STAGE_PARSER); if (new_LYhndl != context->inUCLYhndl && new_LYhndl >= 0) { /* * Something changed. but ignore if a META wants an unknown charset. 
*/ LYUCcharset * new_UCI = HTAnchor_getUCInfoStage(context->node_anchor, UCT_STAGE_PARSER); if (new_UCI) { LYUCcharset * next_UCI = HTAnchor_getUCInfoStage( context->node_anchor, UCT_STAGE_STRUCTURED ); int next_LYhndl = HTAnchor_getUCLYhndl( context->node_anchor, UCT_STAGE_STRUCTURED ); context->inUCI = new_UCI; context->inUCLYhndl = new_LYhndl; context->outUCI = next_UCI; context->outUCLYhndl = next_LYhndl; set_chartrans_handling(context, context->node_anchor, next_LYhndl); } } } #define PUTC(ch) ((*context->actions->put_character)(context->target, ch)) #define PUTUTF8(code) (UCPutUtf8_charstring((HTStream *)context->target, \ (putc_func_t*)(context->actions->put_character), code)) extern BOOL historical_comments; extern BOOL minimal_comments; extern BOOL soft_dquotes; #ifdef USE_COLOR_STYLE #include static int current_is_class=0; #endif /* Handle Attribute ** ---------------- */ /* PUBLIC CONST char * SGML_default = ""; ?? */ PRIVATE void handle_attribute_name ARGS2( HTStream *, context, CONST char *, s) { HTTag * tag = context->current_tag; attr * attributes = tag->attributes; int high, low, i, diff; #ifdef USE_PRETTYSRC if (psrc_view) { attr_is_href = FALSE; attr_is_name = FALSE; } #endif /* ** Ignore unknown tag. - KW */ if (tag == context->unknown_tag) { #ifdef USE_PRETTYSRC if (psrc_view) context->current_attribute_number = 1; /* anything !=INVALID */ #endif return; } /* ** Binary search for attribute name. */ for (low = 0, high = tag->number_of_attributes; high > low; diff < 0 ? 
(low = i+1) : (high = i)) { i = (low + (high-low)/2); diff = strcasecomp(attributes[i].name, s); if (diff == 0) { /* success: found it */ context->current_attribute_number = i; #ifdef USE_PRETTYSRC if (!psrc_view) { #endif context->present[i] = YES; FREE(context->value[i]); #ifdef USE_COLOR_STYLE # ifdef USE_PRETTYSRC current_is_class = IS_C(attributes[i]); # else current_is_class = (!strcasecomp("class", s)); # endif CTRACE((tfp, "SGML: found attribute %s, %d\n", s, current_is_class)); #endif #ifdef USE_PRETTYSRC } else { attr_is_name = (BOOL) (attributes[i].type == HTMLA_ANAME); attr_is_href = (BOOL) (attributes[i].type == HTMLA_HREF); } #endif return; } /* if */ } /* for */ CTRACE((tfp, "SGML: Unknown attribute %s for tag %s\n", s, context->current_tag->name)); context->current_attribute_number = INVALID; /* Invalid */ } /* Handle attribute value ** ---------------------- */ PRIVATE void handle_attribute_value ARGS2( HTStream *, context, CONST char *, s) { if (context->current_attribute_number != INVALID) { StrAllocCopy(context->value[context->current_attribute_number], s); #ifdef USE_COLOR_STYLE if (current_is_class) { strncpy (class_string, s, TEMPSTRINGSIZE); CTRACE((tfp, "SGML: class is '%s'\n", s)); } else { CTRACE((tfp, "SGML: attribute value is '%s'\n", s)); } #endif } else { CTRACE((tfp, "SGML: Attribute value %s ***ignored\n", s)); } context->current_attribute_number = INVALID; /* can't have two assignments! */ } /* ** Translate some Unicodes to Lynx special codes and output them. ** Special codes - ones those output depend on parsing. ** ** Additional issue, like handling bidirectional text if necessary ** may be called from here: zwnj (8204), zwj (8205), lrm (8206), rlm (8207) ** - currently they are ignored in SGML.c and LYCharUtils.c ** but also in UCdomap.c because they are non printable... 
** */ PRIVATE BOOL put_special_unicodes ARGS2( HTStream *, context, UCode_t, code) { /* (Tgf_nolyspcl) */ if (context->no_lynx_specialcodes) { /* ** We were asked by a "DTD" flag to not generate lynx specials. - kw */ return NO; } if (code == CH_NBSP) { /* S/390 -- gil -- 0657 */ /* ** Use Lynx special character for nbsp. */ #ifdef USE_PRETTYSRC if (!psrc_view) #endif PUTC(HT_NON_BREAK_SPACE); } else if (code == CH_SHY) { /* ** Use Lynx special character for shy. */ #ifdef USE_PRETTYSRC if (!psrc_view) #endif PUTC(LY_SOFT_HYPHEN); } else if (code == 8194 || code == 8201) { /* ** Use Lynx special character for ensp or thinsp. ** ** Originally, Lynx use space '32' as word delimiter and omits this ** space at end of line if word is wrapped to the next line. There ** are several other spaces in the Unicode repertoire and we should ** teach Lynx to understand them, not only as regular characters but ** in the context of line wrapping. Unfortunately, if we use ** HT_EN_SPACE we override the chartrans tables for those spaces ** with a single '32' for all (but do line wrapping more fancy). ** ** We may treat emsp as one or two ensp (below). */ #ifdef USE_PRETTYSRC if (!psrc_view) #endif PUTC(HT_EN_SPACE); } else if (code == 8195) { /* ** Use Lynx special character for emsp. */ #ifdef USE_PRETTYSRC if (!psrc_view) { #endif /* PUTC(HT_EN_SPACE); let's stay with a single space :) */ PUTC(HT_EN_SPACE); #ifdef USE_PRETTYSRC } #endif } else { /* ** Return NO if nothing done. */ return NO; } /* ** We have handled it. */ return YES; } /* Handle entity ** ------------- ** ** On entry, ** s contains the entity name zero terminated ** Bugs: ** If the entity name is unknown, the terminator is treated as ** a printable non-special character in all cases, even if it is '<' ** Bug-fix: ** Modified SGML_character() so we only come here with terminator ** as '\0' and check a FoundEntity flag. 
-- Foteos Macrides ** ** Modified more (for use with Lynx character translation code): */ PRIVATE char replace_buf [64]; /* buffer for replacement strings */ PRIVATE BOOL FoundEntity = FALSE; PRIVATE void handle_entity ARGS2( HTStream *, context, char, term) { UCode_t code; long uck = -1; CONST char *p; CONST char *s = context->string->data; /* ** Handle all entities normally. - FM */ FoundEntity = FALSE; if ((code = HTMLGetEntityUCValue(s)) != 0) { /* ** We got a Unicode value for the entity name. ** Check for special Unicodes. - FM */ if (put_special_unicodes(context, code)) { #ifdef USE_PRETTYSRC if (psrc_view) { HTMLSRC_apply_markup(context,HTL_entity,START); PUTC('&'); PUTS(entity_string); if (term) PUTC(term); HTMLSRC_apply_markup(context,HTL_entity,STOP); } #endif FoundEntity = TRUE; return; } /* ** Seek a translation from the chartrans tables. */ if ((uck = UCTransUniChar(code, context->outUCLYhndl)) >= 32 && /* =============== work in ASCII below here =============== S/390 -- gil -- 0672 */ uck < 256 && (uck < 127 || uck >= LYlowest_eightbit[context->outUCLYhndl])) { #ifdef USE_PRETTYSRC if (psrc_view) { HTMLSRC_apply_markup(context,HTL_entity,START); PUTC('&'); PUTS(entity_string); if (term) PUTC(term); HTMLSRC_apply_markup(context,HTL_entity,STOP); } else #endif PUTC(FROMASCII((char)uck)); FoundEntity = TRUE; return; } else if ((uck == -4 || (context->T.repl_translated_C0 && uck > 0 && uck < 32)) && /* ** Not found; look for replacement string. */ (uck = UCTransUniCharStr(replace_buf, 60, code, context->outUCLYhndl, 0) >= 0)) { #ifdef USE_PRETTYSRC if (psrc_view) { HTMLSRC_apply_markup(context,HTL_entity,START); PUTC('&'); PUTS(entity_string); if (term) PUTC(term); HTMLSRC_apply_markup(context,HTL_entity,STOP); } else #endif for (p = replace_buf; *p; p++) PUTC(*p); FoundEntity = TRUE; return; } /* ** If we're displaying UTF-8, try that now. 
- FM */ #ifndef USE_PRETTYSRC if (context->T.output_utf8 && PUTUTF8(code)) { FoundEntity = TRUE; return; } #else if (context->T.output_utf8 && (psrc_view ? (UCPutUtf8_charstring((HTStream *)context->target, (putc_func_t*)(fake_put_character), code)): PUTUTF8(code) ) ) { if (psrc_view) { HTMLSRC_apply_markup(context,HTL_entity,START); PUTC('&'); PUTS(entity_string); if (term) PUTC(term); HTMLSRC_apply_markup(context,HTL_entity,STOP); } FoundEntity = TRUE; return; } #endif /* ** If it's safe ASCII, use it. - FM */ if (code >= 32 && code < 127) { #ifdef USE_PRETTYSRC if (psrc_view) { HTMLSRC_apply_markup(context,HTL_entity,START); PUTC('&'); PUTS(entity_string); if (term) PUTC(term); HTMLSRC_apply_markup(context,HTL_entity,STOP); } else #endif PUTC(FROMASCII((char)code)); FoundEntity = TRUE; return; } /* =============== work in ASCII above here =============== S/390 -- gil -- 0682 */ /* ** Ignore zwnj (8204) and zwj (8205), if we get to here. ** Note that zwnj may have been handled as ** by the calling function. - FM */ if (!strcmp(s, "zwnj") || !strcmp(s, "zwj")) { CTRACE((tfp, "handle_entity: Ignoring '%s'.\n", s)); #ifdef USE_PRETTYSRC if (psrc_view) { HTMLSRC_apply_markup(context,HTL_entity,START); PUTC('&'); PUTS(entity_string); if (term) PUTC(term); HTMLSRC_apply_markup(context,HTL_entity,STOP); } #endif FoundEntity = TRUE; return; } /* ** Ignore lrm (8206), and rln (8207), if we get to here. - FM */ if (!strcmp(s, "lrm") || !strcmp(s, "rlm")) { CTRACE((tfp, "handle_entity: Ignoring '%s'.\n", s)); #ifdef USE_PRETTYSRC if (psrc_view) { HTMLSRC_apply_markup(context,HTL_entity,START); PUTC('&'); PUTS(entity_string); if (term) PUTC(term); HTMLSRC_apply_markup(context,HTL_entity,STOP); } #endif FoundEntity = TRUE; return; } } /* ** If entity string not found, display as text. 
*/ #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTART(badseq); #endif CTRACE((tfp, "SGML: Unknown entity '%s' %ld %ld\n", s, (long)code, uck)); /* S/390 -- gil -- 0695 */ PUTC('&'); for (p = s; *p; p++) { PUTC(*p); } if (term != '\0') PUTC(term); #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTOP(badseq); #endif } /* Handle comment ** -------------- */ PRIVATE void handle_comment ARGS1( HTStream *, context) { CONST char *s = context->string->data; CTRACE((tfp, "SGML Comment:\n<%s>\n", s)); if (context->csi == NULL && strncmp(s, "!--#", 4) == 0 && LYCheckForCSI(context->node_anchor, (char **)&context->url) == TRUE) { LYDoCSI(context->url, s, (char **)&context->csi); } else { LYCommentHacks(context->node_anchor, context->string->data); } return; } /* Handle identifier ** ----------------- */ PRIVATE void handle_identifier ARGS1( HTStream *, context) { CONST char *s = context->string->data; CTRACE((tfp, "SGML Identifier:\n<%s>\n", s)); return; } /* Handle doctype ** -------------- */ PRIVATE void handle_doctype ARGS1( HTStream *, context) { CONST char *s = context->string->data; CTRACE((tfp, "SGML Doctype:\n<%s>\n", s)); return; } PRIVATE void SGML_write PARAMS(( HTStream * me, CONST char * s, int l)); /* Handle marked ** ------------- */ PRIVATE void handle_marked ARGS1( HTStream *, context) { CONST char *s = context->string->data; CTRACE((tfp, "SGML Marked Section:\n<%s>\n", s)); if (!strncmp(context->string->data, "![INCLUDE[", 10)) { context->string->data[context->string->size - 3] = '\0'; StrAllocCat(context->include, context->string->data + 10); /* @@@ This needs to take charset into account! 
@@@ the wrong assumptions will be made about the data's charset once it is in include - kw */ } else if (!strncmp(context->string->data, "![CDATA[", 8)) { (*context->actions->_write)(context->target, context->string->data + 8, context->string->size - 11); } return; } /* Handle sgmlent ** -------------- */ PRIVATE void handle_sgmlent ARGS1( HTStream *, context) { CONST char *s = context->string->data; CTRACE((tfp, "SGML Entity Declaration:\n<%s>\n", s)); return; } /* Handle sgmlent ** -------------- */ PRIVATE void handle_sgmlele ARGS1( HTStream *, context) { CONST char *s = context->string->data; CTRACE((tfp, "SGML Element Declaration:\n<%s>\n", s)); return; } /* Handle sgmlatt ** -------------- */ PRIVATE void handle_sgmlatt ARGS1( HTStream *, context) { CONST char *s = context->string->data; CTRACE((tfp, "SGML Attribute Declaration:\n<%s>\n", s)); return; } /* * Convenience macros - tags (elements) are identified sometimes * by an int or enum value ('TAGNUM'), sometimes * by a pointer to HTTag ('TAGP'). - kw */ #define TAGNUM_OF_TAGP(t) (t - context->dtd->tags) #define TAGP_OF_TAGNUM(e) (context->dtd->tags + e) /* * The following implement special knowledge about OBJECT. * As long as HTML_OBJECT is the only tag for which an alternative * variant exist, they can be simple macros. - kw */ /* does 'TAGNUM' e have an alternative (variant) parsing mode? */ #define HAS_ALT_TAGNUM(e) (e == HTML_OBJECT) /* return 'TAGNUM' of the alternative mode for 'TAGNUM' e, if any. */ #define ALT_TAGNUM(e) ((e == HTML_OBJECT) ? HTML_ALT_OBJECT : e) /* return 'TAGNUM' of the normal mode for 'TAGNUM' e which may be alt. */ #define NORMAL_TAGNUM(e) ((e >= HTML_ELEMENTS) ? HTML_OBJECT : e) /* More convenience stuff. 
- kw */ #define ALT_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(ALT_TAGNUM(e)) #define NORMAL_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(NORMAL_TAGNUM(e)) #define ALT_TAGP(t) ALT_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t)) #define NORMAL_TAGP(t) NORMAL_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t)) #ifdef EXTENDED_HTMLDTD PRIVATE BOOL element_valid_within ARGS3( HTTag *, new_tag, HTTag *, stacked_tag, BOOL, direct) { TagClass usecontains, usecontained; if (!stacked_tag || !new_tag) return YES; usecontains = (direct ? stacked_tag->contains : stacked_tag->icontains); usecontained = (direct ? new_tag->contained : new_tag->icontained); if (new_tag == stacked_tag) return (BOOL) ((Tgc_same & usecontains) && (Tgc_same & usecontained)); else return (BOOL) ((new_tag->tagclass & usecontains) && (stacked_tag->tagclass & usecontained)); } extern BOOL Old_DTD; typedef enum { close_NO = 0, close_error = 1, close_valid = 2 } canclose_t; PRIVATE canclose_t can_close ARGS2( HTTag *, new_tag, HTTag *, stacked_tag) { if (!stacked_tag) return close_NO; if (stacked_tag->flags & Tgf_endO) return close_valid; else if (new_tag == stacked_tag) return ((Tgc_same & new_tag->canclose) ? close_error : close_NO); else return ((stacked_tag->tagclass & new_tag->canclose) ? close_error : close_NO); } PRIVATE void do_close_stacked ARGS1( HTStream *, context) { HTElement * stacked = context->element_stack; HTMLElement e; if (!stacked) return; /* stack was empty */ if (context->inSELECT && !strcasecomp(stacked->tag->name, "SELECT")) { context->inSELECT = FALSE; } e = NORMAL_TAGNUM(TAGNUM_OF_TAGP(stacked->tag)); #ifdef USE_PRETTYSRC if (!psrc_view) /* Don't actually pass call on if viewing psrc - kw */ #endif (*context->actions->end_element)( context->target, e, (char **)&context->include); context->element_stack = stacked->next; FREE(stacked); context->no_lynx_specialcodes = context->element_stack ? 
(context->element_stack->tag->flags & Tgf_nolyspcl) : NO; } PRIVATE int is_on_stack ARGS2( HTStream *, context, HTTag *, old_tag) { HTElement * stacked = context->element_stack; int i = 1; for (; stacked; stacked = stacked->next, i++) { if (stacked->tag == old_tag || stacked->tag == ALT_TAGP(old_tag)) return i; } return 0; } #endif /* EXTENDED_HTMLDTD */ /* End element ** ----------- */ PRIVATE void end_element ARGS2( HTStream *, context, HTTag *, old_tag) { #ifdef EXTENDED_HTMLDTD BOOL extra_action_taken = NO; canclose_t canclose_check = close_valid; int stackpos = is_on_stack(context, old_tag); if (!Old_DTD) { while (canclose_check != close_NO && context->element_stack && (stackpos > 1 || (!extra_action_taken && stackpos == 0))) { if (stackpos == 0 && (old_tag->flags & Tgf_startO) && element_valid_within(old_tag, context->element_stack->tag, YES)) { CTRACE((tfp, "SGML: ignored\n", old_tag->name)); return; } canclose_check = can_close(old_tag, context->element_stack->tag); if (canclose_check != close_NO) { CTRACE((tfp, "SGML: End \t<- %s end \n", context->element_stack->tag->name, canclose_check == close_valid ? "supplied," : "***forced by", old_tag->name)); do_close_stacked(context); extra_action_taken = YES; stackpos = is_on_stack(context, old_tag); #if 0 /* done below with more specific message - kw */ } else { CTRACE((tfp, "SGML: Still open %s \t<- ***invalid end \n", context->element_stack->tag->name, old_tag->name)); return; #endif } } if (stackpos == 0 && old_tag->contents != SGML_EMPTY) { CTRACE((tfp, "SGML: Still open %s, ***no open %s for \n", context->element_stack ? context->element_stack->tag->name : "none", old_tag->name, old_tag->name)); return; } if (stackpos > 1) { CTRACE((tfp, "SGML: Nesting <%s>...<%s> \t<- ***invalid end \n", old_tag->name, context->element_stack->tag->name, old_tag->name)); return; } } /* Now let the non-extended code deal with the rest. 
- kw */ #endif /* EXTENDED_HTMLDTD */ /* ** If we are in a SELECT block, ignore anything ** but a SELECT end tag. - FM */ if (context->inSELECT) { if (!strcasecomp(old_tag->name, "SELECT")) { /* ** Turn off the inSELECT flag and fall through. - FM */ context->inSELECT = FALSE; } else { /* ** Ignore the end tag. - FM */ CTRACE((tfp, "SGML: ***Ignoring end tag in SELECT block.\n", old_tag->name)); return; } } /* ** Handle the end tag. - FM */ CTRACE((tfp, "SGML: End \n", old_tag->name)); if (old_tag->contents == SGML_EMPTY) { CTRACE((tfp, "SGML: ***Illegal end tag found.\n", old_tag->name)); return; } #ifdef WIND_DOWN_STACK while (context->element_stack) /* Loop is error path only */ #else if (context->element_stack) /* Substitute and remove one stack element */ #endif /* WIND_DOWN_STACK */ { int status = HT_OK; HTMLElement e; HTElement * N = context->element_stack; HTTag * t = (N->tag != old_tag) ? NORMAL_TAGP(N->tag) : N->tag; if (old_tag != t) { /* Mismatch: syntax error */ if (context->element_stack->next) { /* This is not the last level */ CTRACE((tfp, "SGML: Found when expecting . ***assumed.\n", old_tag->name, t->name, t->name)); } else { /* last level */ CTRACE((tfp, "SGML: Found when expecting . 
***Ignored.\n", old_tag->name, t->name, old_tag->name)); return; /* Ignore */ } } e = NORMAL_TAGNUM(TAGNUM_OF_TAGP(t)); #ifdef USE_PRETTYSRC if (!psrc_view) /* Don't actually pass call on if viewing psrc - kw */ #endif status = (*context->actions->end_element)(context->target, e, (char **)&context->include); if (status == HT_PARSER_REOPEN_ELT) { CTRACE((tfp, "SGML: Restart <%s>\n", t->name)); (*context->actions->start_element)( context->target, e, NULL, NULL, context->current_tag_charset, (char **)&context->include); } else if (status == HT_PARSER_OTHER_CONTENT) { CTRACE((tfp, "SGML: Continue with other content model for <%s>\n", t->name)); context->element_stack->tag = ALT_TAGP_OF_TAGNUM(e); } else { context->element_stack = N->next; /* Remove from stack */ FREE(N); } context->no_lynx_specialcodes = context->element_stack ? (context->element_stack->tag->flags & Tgf_nolyspcl) : NO; #ifdef WIND_DOWN_STACK if (old_tag == t) return; /* Correct sequence */ #else return; #endif /* WIND_DOWN_STACK */ /* Syntax error path only */ } CTRACE((tfp, "SGML: Extra end tag found and ignored.\n", old_tag->name)); } /* Start a element */ PRIVATE void start_element ARGS1( HTStream *, context) { int status; HTTag * new_tag = context->current_tag; HTMLElement e = TAGNUM_OF_TAGP(new_tag); BOOL ok = FALSE; #ifdef EXTENDED_HTMLDTD BOOL valid = YES; BOOL direct_container = YES; BOOL extra_action_taken = NO; canclose_t canclose_check = close_valid; if (!Old_DTD) { while (context->element_stack && (canclose_check == close_valid || (canclose_check == close_error && new_tag == context->element_stack->tag)) && !(valid = element_valid_within(new_tag, context->element_stack->tag, direct_container))) { canclose_check = can_close(new_tag, context->element_stack->tag); if (canclose_check != close_NO) { CTRACE((tfp, "SGML: End \t<- %s start <%s>\n", context->element_stack->tag->name, canclose_check == close_valid ? 
"supplied," : "***forced by", new_tag->name)); do_close_stacked(context); extra_action_taken = YES; if (canclose_check == close_error) direct_container = NO; } else { CTRACE((tfp, "SGML: Still open %s \t<- ***invalid start <%s>\n", context->element_stack->tag->name, new_tag->name)); } } if (context->element_stack && !valid && (context->element_stack->tag->flags & Tgf_strict) && !(valid = element_valid_within(new_tag, context->element_stack->tag, direct_container))) { CTRACE((tfp, "SGML: Still open %s \t<- ***ignoring start <%s>\n", context->element_stack->tag->name, new_tag->name)); return; } if (context->element_stack && !extra_action_taken && canclose_check == close_NO && !valid && (new_tag->flags & Tgf_mafse)) { BOOL has_attributes = NO; int i = 0; for (; i< new_tag->number_of_attributes && !has_attributes; i++) has_attributes = context->present[i]; if (!has_attributes) { CTRACE((tfp, "SGML: Still open %s, ***converting invalid <%s> to \n", context->element_stack->tag->name, new_tag->name, new_tag->name)); end_element(context, new_tag); return; } } if (context->element_stack && canclose_check == close_error && !(valid = element_valid_within( new_tag, context->element_stack->tag, direct_container))) { CTRACE((tfp, "SGML: Still open %s \t<- ***invalid start <%s>\n", context->element_stack->tag->name, new_tag->name)); } } /* Fall through to the non-extended code - kw */ #endif /* EXTENDED_HTMLDTD */ /* ** If we are not in a SELECT block, check if this is ** a SELECT start tag. Otherwise (i.e., we are in a ** SELECT block) accept only OPTION as valid, terminate ** the SELECT block if it is any other form-related ** element, and otherwise ignore it. - FM */ if (!context->inSELECT) { /* ** We are not in a SELECT block, so check if this starts one. - FM */ if (!strcasecomp(new_tag->name, "SELECT")) { /* ** Set the inSELECT flag and fall through. - FM */ context->inSELECT = TRUE; } } else { /* ** We are in a SELECT block. 
- FM */
        if (strcasecomp(new_tag->name, "OPTION")) {
            /*
            **  Ugh, it is not an OPTION. - FM
            */
            switch (e) {
            case HTML_INPUT:
            case HTML_TEXTAREA:
            case HTML_SELECT:
            case HTML_BUTTON:
            case HTML_FIELDSET:
            case HTML_LABEL:
            case HTML_LEGEND:
            case HTML_FORM:
                ok = TRUE;
                break;
            default:
                break;
            }
            if (ok) {
                /*
                **  It is another form-related start tag, so terminate
                **  the current SELECT block and fall through. - FM
                */
                CTRACE((tfp, "SGML: ***Faking SELECT end tag before <%s> start tag.\n",
                        new_tag->name));
                end_element(context, SGMLFindTag(context->dtd, "SELECT"));
            } else {
                /*
                **  Ignore the start tag. - FM
                */
                CTRACE((tfp, "SGML: ***Ignoring start tag <%s> in SELECT block.\n",
                        new_tag->name));
                return;
            }
        }
    }
    /*
    **  Handle the start tag. - FM
    */
    CTRACE((tfp, "SGML: Start <%s>\n", new_tag->name));
    status = (*context->actions->start_element)(
        context->target,
        TAGNUM_OF_TAGP(new_tag),
        context->present,
        (CONST char**) context->value,  /* coerce type for think c */
        context->current_tag_charset,
        (char **)&context->include);
    if (status == HT_PARSER_OTHER_CONTENT)
        new_tag = ALT_TAGP(new_tag);    /* this is only returned for OBJECT */
    /*
    **  Only elements whose content model is not SGML_EMPTY are pushed
    **  on the element stack; the new entry becomes the stack top.
    */
    if (new_tag->contents != SGML_EMPTY) {              /* i.e., tag not empty */
        HTElement * N = (HTElement *)malloc(sizeof(HTElement));
        if (N == NULL)
            outofmem(__FILE__, "start_element");
        N->next = context->element_stack;
        N->tag = new_tag;
        context->element_stack = N;
        context->no_lynx_specialcodes = (new_tag->flags & Tgf_nolyspcl);

    } else if (e == HTML_META ) {
        /*
        **  Check for result of META tag. - KW & FM
        */
        change_chartrans_handling(context);
    }
}


/*	Find Tag in DTD tag list
**	------------------------
**
** On entry,
**	dtd	points to dtd structure including valid tag list
**	string	points to name of tag in question
**
** On exit,
**	returns:
**		address of the matching tag structure in dtd, found by
**		a case-insensitive binary search over dtd->tags
**		&HTTag_unrecognized if there is no match but string
**		starts with a character accepted by IsNmStart()
**		NULL otherwise
*/
PUBLIC HTTag * SGMLFindTag ARGS2(
    CONST SGML_dtd*,    dtd,
    CONST char *,       string)
{
    int high, low, i, diff;
    /* The step expression narrows [low, high) using the sign of diff. */
    for (low = 0, high=dtd->number_of_tags;
         high > low;
         diff < 0 ?
(low = i+1) : (high = i)) { /* Binary search */ i = (low + (high-low)/2); diff = AS_casecomp(dtd->tags[i].name, string); /* Case insensitive */ if (diff == 0) { /* success: found it */ return &dtd->tags[i]; } } if (IsNmStart(string[0])) { /* ** Unrecognized, but may be valid. - KW */ return &HTTag_unrecognized; } return NULL; } /*________________________________________________________________________ ** Public Methods */ /* Could check that we are back to bottom of stack! @@ */ /* Do check! - FM */ /* */ PRIVATE void SGML_free ARGS1( HTStream *, context) { int i; HTElement * cur; HTTag * t; /* ** Free the buffers. - FM */ FREE(context->recover); FREE(context->url); FREE(context->csi); FREE(context->include); FREE(context->active_include); /* ** Wind down stack if any elements are open. - FM */ while (context->element_stack) { cur = context->element_stack; t = cur->tag; context->element_stack = cur->next; /* Remove from stack */ FREE(cur); #ifdef USE_PRETTYSRC if (!psrc_view) /* Don't actually call on target if viewing psrc - kw */ #endif (*context->actions->end_element)(context->target, NORMAL_TAGNUM(TAGNUM_OF_TAGP(t)), (char **)&context->include); FREE(context->include); } /* ** Finish off the target. - FM */ (*context->actions->_free)(context->target); /* ** Free the strings and context structure. - FM */ HTChunkFree(context->string); for (i = 0; i < MAX_ATTRIBUTES; i++) FREE(context->value[i]); FREE(context); #ifdef USE_PRETTYSRC sgml_in_psrc_was_initialized =FALSE; #endif } PRIVATE void SGML_abort ARGS2( HTStream *, context, HTError, e) { int i; HTElement * cur; /* ** Abort the target. - FM */ (*context->actions->_abort)(context->target, e); /* ** Free the buffers. - FM */ FREE(context->recover); FREE(context->include); FREE(context->active_include); FREE(context->url); FREE(context->csi); /* ** Free stack memory if any elements were left open. 
- KW */
    /* Unlike SGML_free, no end_element calls are made on the target here. */
    while (context->element_stack) {
        cur = context->element_stack;
        context->element_stack = cur->next;     /* Remove from stack */
        FREE(cur);
    }

    /*
    **  Free the strings and context structure. - FM
    */
    HTChunkFree(context->string);
    for (i = 0; i < MAX_ATTRIBUTES; i++)
        FREE(context->value[i]);
    FREE(context);

#ifdef USE_PRETTYSRC
    sgml_in_psrc_was_initialized =FALSE;
#endif
}


/*	Read and write user callback handle
**	-----------------------------------
**
**   The callbacks from the SGML parser have an SGML context parameter.
**   These calls allow the caller to associate his own context with a
**   particular SGML context.
**
**   Both functions are compiled only when CALLERDATA is defined.
*/

#ifdef CALLERDATA
/* Return the caller-supplied pointer stored in the context. */
PUBLIC void* SGML_callerData ARGS1(
    HTStream *,     context)
{
    return context->callerData;
}

/* Store a caller-supplied pointer in the context. */
PUBLIC void SGML_setCallerData ARGS2(
    HTStream *,     context,
    void*,          data)
{
    context->callerData = data;
}
#endif /* CALLERDATA */

/*
**	Feed one input character (c_in) to the parser state in context.
*/
PRIVATE void SGML_character ARGS2(
    HTStream *,     context,
    char,           c_in)
{
    CONST SGML_dtd *dtd = context->dtd;
    HTChunk *string = context->string;
    CONST char * EntityName;
    char * p;
    HTTag * testtag = NULL;
    BOOLEAN chk;        /* Helps (?) walk through all the else ifs... */
    UCode_t clong, uck = 0; /* Enough bits for UCS4 ... */
#ifdef CJK_EX
    unsigned char c;
#else
    char c;
#endif
    char saved_char_in = '\0';

    /*
    **  Now some fun with the preprocessor.
    **  Use copies for c and unsign_c == clong, so that
    **  we can revert back to the unchanged c_in. - KW
    */
#define unsign_c clong

#if 0
    static unsigned char sjis_1st = '\0';
    unsigned char sjis_hi, sjis_lo;
#endif

    c = c_in;
    clong = (unsigned char)c;   /* a.k.a. unsign_c */

    if (context->T.decode_utf8) {
        /*
        **  Combine UTF-8 into Unicode.
        **  Incomplete characters silently ignored.
        **  From Linux kernel's console.c. - KW
        */
        if (TOASCII((unsigned char)c) > 127) { /* S/390 -- gil -- 0710 */
            /* ** We have an octet from a multibyte character.
- FM */ if (context->utf_count > 0 && (TOASCII(c) & 0xc0) == 0x80) { context->utf_char = (context->utf_char << 6) | (TOASCII(c) & 0x3f); context->utf_count--; *(context->utf_buf_p) = c; (context->utf_buf_p)++; if (context->utf_count == 0) { /* ** We have all of the bytes, so terminate ** the buffer and set 'clong' to the UCode_t ** value. - FM */ *(context->utf_buf_p) = '\0'; clong = context->utf_char; if (clong < 256) { c = ((char)(clong & 0xff)); } goto top1; } else { /* ** Wait for more. - KW */ return; } } else { /* ** Start handling a new multibyte character. - FM */ context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = c; (context->utf_buf_p)++; if ((c & 0xe0) == 0xc0) { context->utf_count = 1; context->utf_char = (c & 0x1f); } else if ((c & 0xf0) == 0xe0) { context->utf_count = 2; context->utf_char = (c & 0x0f); } else if ((c & 0xf8) == 0xf0) { context->utf_count = 3; context->utf_char = (c & 0x07); } else if ((c & 0xfc) == 0xf8) { context->utf_count = 4; context->utf_char = (c & 0x03); } else if ((c & 0xfe) == 0xfc) { context->utf_count = 5; context->utf_char = (c & 0x01); } else { /* ** Garbage. - KW */ context->utf_count = 0; context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; } /* ** Wait for more. - KW */ return; } } else { /* ** Got an ASCII char. - KW */ context->utf_count = 0; context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; /* goto top; */ } } /* end of context->T.decode_utf8 S/390 -- gil -- 0726 */ #ifdef NOTDEFINED /* ** If we have a koi8-r input and do not have ** koi8-r as the output, save the raw input ** in saved_char_in before we potentially ** convert it to Unicode. - FM */ if (context->T.strip_raw_char_in) saved_char_in = c; #endif /* NOTDEFINED */ /* ** If we want the raw input converted ** to Unicode, try that now. 
- FM */ if (context->T.trans_to_uni && ((TOASCII(unsign_c) >= LYlowest_eightbit[context->inUCLYhndl]) || /* S/390 -- gil -- 0744 */ (unsign_c < ' ' && unsign_c != 0 && context->T.trans_C0_to_uni))) { /* ** Convert the octet to Unicode. - FM */ clong = UCTransToUni(c, context->inUCLYhndl); if (clong > 0) { saved_char_in = c; if (clong < 256) { c = FROMASCII((char)clong); } } goto top1; } else if (unsign_c < ' ' && unsign_c != 0 && /* S/390 -- gil -- 0768 */ context->T.trans_C0_to_uni) { /* ** This else if may be too ugly to keep. - KW */ if (context->T.trans_from_uni && (((clong = UCTransToUni(c, context->inUCLYhndl)) >= ' ') || (context->T.transp && (clong = UCTransToUni(c, context->inUCLYhndl)) > 0))) { saved_char_in = c; if (clong < 256) { c = FROMASCII((char)clong); } goto top1; } else { uck = -1; if (context->T.transp) { uck = UCTransCharStr(replace_buf, 60, c, context->inUCLYhndl, context->inUCLYhndl, NO); } if (!context->T.transp || uck < 0) { uck = UCTransCharStr(replace_buf, 60, c, context->inUCLYhndl, context->outUCLYhndl, YES); } if (uck == 0) { return; } else if (uck < 0) { goto top0a; } c = replace_buf[0]; if (c && replace_buf[1]) { if (context->state == S_text) { for (p = replace_buf; *p; p++) PUTC(*p); return; } StrAllocCat(context->recover, replace_buf + 1); } goto top0a; } /* Next line end of ugly stuff for C0. - KW */ } else { /* end of context->T.trans_to_uni S/390 -- gil -- 0791 */ goto top0a; } /* ** At this point we have either unsign_c a.k.a. clong in ** Unicode (and c in latin1 if clong is in the latin1 range), ** or unsign_c and c will have to be passed raw. - KW */ /* ** We jump up to here from below if we have ** stuff in the recover, insert, or csi buffers ** to process. We zero saved_char_in, in effect ** as a flag that the octet in not that of the ** actual call to this function. 
This may be OK ** for now, for the stuff this function adds to ** its recover buffer, but it might not be for ** stuff other functions added to the insert or ** csi buffer, so bear that in mind. - FM ** Stuff from the recover buffer is now handled ** as UTF-8 if we can expect that's what it is, ** and in that case we don't come back up here. - kw */ top: saved_char_in = '\0'; /* ** We jump to here from above when we don't have ** UTF-8 input, haven't converted to Unicode, and ** want clong set to the input octet (unsigned) ** without zeroing its saved_char_in copy (which ** is signed). - FM */ top0a: *(context->utf_buf) = '\0'; clong = (unsigned char)c; /* ** We jump to here from above if we have converted ** the input, or a multibyte sequence across calls, ** to a Unicode value and loaded it into clong (to ** which unsign_c has been defined), and from below ** when we are recycling a character (e.g., because ** it terminated an entity but is not the standard ** semi-colon). The character will already have ** been put through the Unicode conversions. - FM */ top1: /* ** Ignore low ISO 646 7-bit control characters ** if HTCJK is not set. - FM */ /* ** Works for both ASCII and EBCDIC. -- gil */ /* S/390 -- gil -- 0811 */ if (TOASCII(unsign_c) < 32 && c != '\t' && c != '\n' && c != '\r' && HTCJK == NOCJK) goto after_switch; #if 0 /* This JIS X0201 Kana to JIS X0208 Kana conversion is/should be * done in the HTextAppendCharacter. -- TH */ #ifdef CJK_EX /* 1998/11/24 (Tue) 17:02:31 */ if (HTCJK == JAPANESE && last_kcode == SJIS) { if (sjis_1st == '\0' && (IS_SJIS_HI1(c) || IS_SJIS_HI2(c))) { sjis_1st = c; } else if (sjis_1st && IS_SJIS_LO(c)) { sjis_1st = '\0'; } else { if (context->state == S_text) { if (0xA1 <= (unsigned char)c && (unsigned char)c <= 0xDF) { JISx0201TO0208_SJIS(c, &sjis_hi, &sjis_lo); PUTC(sjis_hi); PUTC(sjis_lo); goto after_switch; } } } } #endif #endif /* ** Ignore 127 if we don't have HTPassHighCtrlRaw ** or HTCJK set. 
- FM */ #define PASSHICTRL (context->T.transp || \ unsign_c >= LYlowest_eightbit[context->inUCLYhndl]) if (TOASCII(c) == 127 && /* S/390 -- gil -- 0830 */ !(PASSHICTRL || HTCJK != NOCJK)) goto after_switch; /* ** Ignore 8-bit control characters 128 - 159 if ** neither HTPassHighCtrlRaw nor HTCJK is set. - FM */ if (TOASCII(unsign_c) > 127 && TOASCII(unsign_c) < 160 && /* S/390 -- gil -- 0847 */ !(PASSHICTRL || HTCJK != NOCJK)) goto after_switch; /* Almost all CJK characters are double byte but only Japanese * JIS X0201 Kana is single byte. To prevent to fail SGML parsing * we have to care them here. -- TH */ if ((HTCJK==JAPANESE) && (context->state==S_in_kanji) && !IS_JAPANESE_2BYTE(context->kanji_buf,(unsigned char)c)) { #ifdef CONV_JISX0201KANA_TO_JISX0208KANA if (IS_SJIS_X0201KANA(context->kanji_buf)) { JISx0201TO0208_SJIS(context->kanji_buf, &sjis_hi, &sjis_lo); PUTC(sjis_hi); PUTC(sjis_lo); } else #endif PUTC(context->kanji_buf); context->state = S_text; } /* ** Handle character based on context->state. */ switch(context->state) { case S_in_kanji: /* ** Note that if we don't have a CJK input, then this ** is not the second byte of a CJK di-byte, and we're ** trashing the input. That's why 8-bit characters ** followed by, for example, '<' can cause the tag to ** be treated as text, not markup. We could try to deal ** with it by holding each first byte and then checking ** byte pairs, but that doesn't seem worth the overhead ** (see below). - FM */ context->state = S_text; PUTC(context->kanji_buf); PUTC(c); break; case S_tagname_slash: /* * We had something link "slashedtag as as a flag; except if we get * '>' directly after the "slashedtag, in which case keep state as * is and let code below deal with it. - kw */ if (!(c == '>' && context->slashedtag && TOASCII(unsign_c) < 127)) { context->state = S_text; } /* fall through in any case! 
*/ case S_text: if (HTCJK != NOCJK && (TOASCII(c) & 0200) != 0) { /* S/390 -- gil -- 0864 */ /* ** Setting up for Kanji multibyte handling (based on ** Takuya ASADA's (asada@three-a.co.jp) CJK Lynx). ** Note that if the input is not in fact CJK, the ** next byte also will be mishandled, as explained ** above. Toggle raw mode off in such cases, or ** select the "7 bit approximations" display ** character set, which is largely equivalent ** to having raw mode off with CJK. - FM */ context->state = S_in_kanji; context->kanji_buf = c; break; } else if (HTCJK != NOCJK && TOASCII(c) == '\033') { /* S/390 -- gil -- 0881 */ /* ** Setting up for CJK escape sequence handling (based on ** Takuya ASADA's (asada@three-a.co.jp) CJK Lynx). - FM */ context->state = S_esc; PUTC(c); break; } if (c == '&' || c == '<') { #ifdef USE_PRETTYSRC if (psrc_view) { /*there is nothing useful in the element_stack*/ testtag = context->current_tag; } else #endif testtag = context->element_stack ? context->element_stack->tag : NULL; } if (c == '&' && TOASCII(unsign_c) < 127 && /* S/390 -- gil -- 0898 */ (!testtag || (testtag->contents == SGML_MIXED || testtag->contents == SGML_ELEMENT || testtag->contents == SGML_PCDATA || #ifdef USE_PRETTYSRC testtag->contents == SGML_EMPTY || #endif testtag->contents == SGML_RCDATA))) { /* ** Setting up for possible entity, without the leading '&'. - FM */ string->size = 0; context->state = S_ero; } else if (c == '<' && TOASCII(unsign_c) < 127) { /* S/390 -- gil -- 0915 */ /* ** Setting up for possible tag. 
- FM */ string->size = 0; if (testtag && testtag->contents == SGML_PCDATA) context->state = S_pcdata; else if (testtag && (testtag->contents == SGML_LITTERAL || testtag->contents == SGML_CDATA)) context->state = S_litteral; else context->state = S_tag; context->slashedtag = NULL; } else if (context->slashedtag && (c == '/' || (c == '>' && context->state == S_tagname_slash)) && TOASCII(unsign_c) < 127) { /* ** We got either the second slash of a pending "' of a mere "". In both ** cases generate a "" end tag in the recover buffer for ** reparsing unless NAME is really an empty element. - kw */ #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(abracket); PUTC(c); PSRCSTOP(abracket); } else #endif if (context->slashedtag != context->unknown_tag && !ReallyEmptyTag(context->slashedtag)) { if (context->recover == NULL) { StrAllocCopy(context->recover, "recover_index = 0; } else { StrAllocCat(context->recover, "recover, context->slashedtag->name); StrAllocCat(context->recover, ">"); } context->slashedtag = NULL; } else if (context->element_stack && (context->element_stack->tag->flags & Tgf_frecyc)) { /* * The element stack says we are within the contents of an * element that the next stage (HTML.c) may want to feed * us back again (via the *include string). So try to output * text in UTF-8 if possible, using the same logic as for * attribute values (which should be in line with what * context->current_tag_charset indicates). 
- kw */ if (context->T.decode_utf8 && *context->utf_buf) { PUTS(context->utf_buf); context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; } else if (HTCJK == NOCJK && (context->T.output_utf8 || context->T.trans_from_uni)) { if (LYIsASCII(clong)) { PUTC(c); } else if (clong == 0xfffd && saved_char_in && HTPassEightBitRaw && (unsigned char)saved_char_in >= LYlowest_eightbit[context->outUCLYhndl]) { PUTUTF8((0xf000 | (unsigned char)saved_char_in)); } else { PUTUTF8(clong); } } else if (saved_char_in && context->T.use_raw_char_in) { PUTC(saved_char_in); } else { PUTC(c); } #define PASS8859SPECL context->T.pass_160_173_raw /* ** Convert 160 (nbsp) to Lynx special character if ** neither HTPassHighCtrlRaw nor HTCJK is set. - FM */ } else if (unsign_c == CH_NBSP && /* S/390 -- gil -- 0932 */ !context->no_lynx_specialcodes && !(PASS8859SPECL || HTCJK != NOCJK)) { PUTC(HT_NON_BREAK_SPACE); /* ** Convert 173 (shy) to Lynx special character if ** neither HTPassHighCtrlRaw nor HTCJK is set. - FM */ } else if (unsign_c == CH_SHY && /* S/390 -- gil -- 0949 */ !context->no_lynx_specialcodes && !(PASS8859SPECL || HTCJK != NOCJK)) { PUTC(LY_SOFT_HYPHEN); /* ** Handle the case in which we think we have a character ** which doesn't need further processing (e.g., a koi8-r ** input for a koi8-r output). - FM */ } else if (context->T.use_raw_char_in && saved_char_in) { /* ** Only if the original character is still in saved_char_in, ** otherwise we may be iterating from a goto top. - KW */ PUTC(saved_char_in); saved_char_in = '\0'; /****************************************************************** * I. 
LATIN-1 OR UCS2 TO DISPLAY CHARSET ******************************************************************/ } else if ((chk = (BOOL) (context->T.trans_from_uni && TOASCII(unsign_c) >= 160)) && /* S/390 -- gil -- 0968 */ (uck = UCTransUniChar(unsign_c, context->outUCLYhndl)) >= ' ' && uck < 256) { CTRACE((tfp, "UCTransUniChar returned 0x%.2lX:'%c'.\n", uck, FROMASCII((char)uck))); /* ** We got one octet from the conversions, so use it. - FM */ PUTC(FROMASCII((char)uck)); } else if ((chk && (uck == -4 || (context->T.repl_translated_C0 && uck > 0 && uck < 32))) && /* ** Not found; look for replacement string. - KW */ (uck = UCTransUniCharStr(replace_buf, 60, clong, context->outUCLYhndl, 0) >= 0)) { /* ** Got a replacement string. ** No further tests for validity - assume that whoever ** defined replacement strings knew what she was doing. - KW */ for (p = replace_buf; *p; p++) PUTC(*p); /* ** If we're displaying UTF-8, try that now. - FM */ } else if (context->T.output_utf8 && PUTUTF8(clong)) { ; /* do nothing more */ /* ** If it's any other (> 160) 8-bit character, and ** we have not set HTPassEightBitRaw nor HTCJK, nor ** have the "ISO Latin 1" character set selected, ** back translate for our character set. 
- FM */ #define IncludesLatin1Enc \ (context->outUCLYhndl == LATIN1 || \ (context->outUCI && \ (context->outUCI->enc & (UCT_CP_SUPERSETOF_LAT1)))) #define PASSHI8BIT (HTPassEightBitRaw || \ (context->T.do_8bitraw && !context->T.trans_from_uni)) } else if (unsign_c > 160 && unsign_c < 256 && !(PASSHI8BIT || HTCJK != NOCJK) && !IncludesLatin1Enc) { int i; #ifdef USE_PRETTYSRC int psrc_view_backup = 0; #endif string->size = 0; EntityName = HTMLGetEntityName((int)(unsign_c - 160)); for (i = 0; EntityName[i]; i++) HTChunkPutc(string, EntityName[i]); HTChunkTerminate(string); #ifdef USE_PRETTYSRC /* we need to disable it temporary*/ if (psrc_view) { psrc_view_backup =1; psrc_view =0; } #endif handle_entity(context, '\0'); #ifdef USE_PRETTYSRC /* we need to disable it temporary*/ if (psrc_view_backup) psrc_view = TRUE; #endif string->size = 0; if (!FoundEntity) PUTC(';'); /* ** If we get to here and have an ASCII char, ** pass the character. - KW */ } else if (TOASCII(unsign_c) < 127 && unsign_c > 0) { /* S/390 -- gil -- 0987 */ PUTC(c); /* ** If we get to here, and should have translated, ** translation has failed so far. - KW ** ** We should have sent UTF-8 output to the parser ** already, but what the heck, try again. - FM */ } else if (context->T.output_utf8 && *context->utf_buf) { for (p = context->utf_buf; *p; p++) PUTC(*p); context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; #ifdef NOTDEFINED /* ** Check for a strippable koi8-r 8-bit character. - FM */ } else if (context->T.strip_raw_char_in && saved_char_in && ((unsigned char)saved_char_in >= 0xc0) && ((unsigned char)saved_char_in < 255)) { /* ** KOI8 special: strip high bit, gives (somewhat) readable ** ASCII or KOI7 - it was constructed that way! - KW */ PUTC(((char)(saved_char_in & 0x7f))); saved_char_in = '\0'; #endif /* NOTDEFINED */ /* ** If we don't actually want the character, ** make it safe and output that now. 
- FM */ } else if (TOASCII((unsigned char)c) < /* S/390 -- gil -- 0997 */ LYlowest_eightbit[context->outUCLYhndl] || (context->T.trans_from_uni && !HTPassEightBitRaw)) { #ifdef NOTUSED_FOTEMODS /* ** If we do not have the "7-bit approximations" as our ** output character set (in which case we did it already) ** seek a translation for that. Otherwise, or if the ** translation fails, use UHHH notation. - FM */ if ((chk = (context->outUCLYhndl != UCGetLYhndl_byMIME("us-ascii"))) && (uck = UCTransUniChar(unsign_c, UCGetLYhndl_byMIME("us-ascii"))) >= ' ' && TOASCII(uck) < 127) { /* S/390 -- gil -- 1008 */ /* ** Got an ASCII character (yippey). - FM */ PUTC(((char)FROMASCII(TOASCII(uck) & 0xff))); } else if ((chk && uck == -4) && (uck = UCTransUniCharStr(replace_buf, 60, clong, UCGetLYhndl_byMIME("us-ascii"), 0) >= 0)) { /* ** Got a replacement string (yippey). - FM */ for (p = replace_buf; *p; p++) PUTC(*p); } else { #endif /* NOTUSED_FOTEMODS */ /* ** Out of luck, so use the UHHH notation (ugh). - FM */ /* S/390 -- gil -- 1018 */ /* do not print UHHH for now sprintf(replace_buf, "U%.2lX", TOASCII(unsign_c)); for (p = replace_buf; *p; p++) { PUTC(*p); } */ #ifdef NOTUSED_FOTEMODS } #endif /* NOTUSED_FOTEMODS */ /* ** If we get to here, pass the character. 
- FM */ } else { PUTC(c); } break; /* ** Found '<' in SGML_PCDATA content; treat this mode nearly like ** S_litteral, but recognize 'size && TOASCII(unsign_c) < 127) { /* first after '<' */ if (c == '!') { /* state = S_exclamation; context->lead_exclamation = TRUE; context->doctype_bracket = FALSE; context->first_bracket = FALSE; HTChunkPutc(string, c); break; } else if (c == '?') { /* ' - kw */ CTRACE((tfp, "SGML: Found PI in PCDATA, junking it until '>'\n")); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(abracket);PUTS("seen_nonwhite_in_junk_tag = TRUE; /* show all */ } #endif context->state = S_junk_pi; break; } } /* Fall through to S_litteral - kw */ /* ** In litteral mode, waits only for specific end tag (for ** compatibility with old servers, and for Lynx). - FM */ case S_litteral: /*PSRC:this case not understood completely by HV, not done*/ HTChunkPutc(string, c); #ifdef USE_PRETTYSRC if (psrc_view) { /*there is nothing useful in the element_stack*/ testtag = context->current_tag; } else #endif testtag = context->element_stack ? context->element_stack->tag : NULL; if (TOUPPER(c) != ((string->size == 1) ? '/' : testtag->name[string->size-2])) { int i; /* ** If complete match, end litteral. 
*/ if ((c == '>') && testtag && !testtag->name[string->size-2]) { #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(abracket);PUTC('<');PUTC('/');PSRCSTOP(abracket); PSRCSTART(tag); strcpy(string->data,context->current_tag->name); if (tagname_transform != 1) { if (tagname_transform == 0) LYLowerCase(string->data); else LYUpperCase(string->data); } PUTS(string->data); PSRCSTOP(tag); PSRCSTART(abracket);PUTC('>');PSRCSTOP(abracket); context->current_tag = NULL; string->size = 0; context->current_attribute_number = INVALID; context->state = S_text; break; } #endif end_element(context, context->element_stack->tag); string->size = 0; context->current_attribute_number = INVALID; context->state = S_text; break; } if (((testtag->contents != SGML_LITTERAL && (testtag->flags & Tgf_strict)) || (context->state == S_pcdata && (testtag->flags & (Tgf_strict|Tgf_endO)))) && (string->size > 1 && (c == '>' || string->size > 2 || IsNmStart(c)))) { context->state = S_end; string->size--; for (i = 0; i < string->size; i++) /* remove '/' */ string->data[i] = string->data[i+1]; if ((string->size == 1) ? IsNmStart(c) : IsNmChar(c)) break; string->size--; goto top1; } if (context->state == S_pcdata && (testtag->flags & (Tgf_strict|Tgf_endO)) && (string->size == 1 && IsNmStart(c))) { context->state = S_tag; break; } /* ** If Mismatch: recover string literally. */ PUTC('<'); for (i = 0; i < string->size-1; i++) /* recover, except last c */ PUTC(string->data[i]); string->size = 0; context->state = S_text; goto top1; /* to recover last c */ } break; /* ** Character reference (numeric entity) or named entity. */ case S_ero: if (c == '#') { /* ** Setting up for possible numeric entity. */ context->state = S_cro; /* &# is Char Ref Open */ break; } context->state = S_entity; /* Fall through! */ /* ** Handle possible named entity. */ case S_entity: if (TOASCII(unsign_c) < 127 && (string->size ? 
/* S/390 -- gil -- 1029 */ isalnum((unsigned char)c) : isalpha((unsigned char)c))) { /* Should probably use IsNmStart/IsNmChar above (is that right?), but the world is not ready for that - there's  : (note colon!) and stuff around. */ /* ** Accept valid ASCII character. - FM */ HTChunkPutc(string, c); } else if (string->size == 0) { /* ** It was an ampersand that's just text, so output ** the ampersand and recycle this character. - FM */ #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTART(badseq); #endif PUTC('&'); #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTOP(badseq); #endif context->state = S_text; goto top1; } else { /* ** Terminate entity name and try to handle it. - FM */ HTChunkTerminate(string); #ifdef USE_PRETTYSRC entity_string = string->data; #endif /* S/390 -- gil -- 1039 */ /* CTRACE((tfp, "%s: %d: %s\n", __FILE__, __LINE__, string->data)); */ if (!strcmp(string->data, "zwnj") && (!context->element_stack || (context->element_stack->tag && context->element_stack->tag->contents == SGML_MIXED))) { /* ** Handle zwnj (8204) as . - FM */ char temp[8]; CTRACE((tfp, "SGML_character: Handling 'zwnj' entity as 'WBR' element.\n")); if (c != ';') { sprintf(temp, "%c", c); } else { sprintf(temp, ""); } if (context->recover == NULL) { StrAllocCopy(context->recover, temp); context->recover_index = 0; } else { StrAllocCat(context->recover, temp); } string->size = 0; context->state = S_text; break; } else { handle_entity(context, '\0'); } string->size = 0; context->state = S_text; /* ** Don't eat the terminator if we didn't find the ** entity name and therefore sent the raw string ** via handle_entity(), or if the terminator is ** not the "standard" semi-colon for HTML. - FM */ #ifdef USE_PRETTYSRC if (psrc_view && FoundEntity && c == ';') { HTMLSRC_apply_markup(context,HTL_entity, START); PUTC(c); HTMLSRC_apply_markup(context,HTL_entity, STOP); } #endif if (!FoundEntity || c != ';') goto top1; } break; /* ** Check for a numeric entity. 
*/ case S_cro: if (TOASCII(unsign_c) < 127 && TOLOWER((unsigned char)c) == 'x') { /* S/390 -- gil -- 1060 */ context->isHex = TRUE; context->state = S_incro; } else if (TOASCII(unsign_c) < 127 && isdigit((unsigned char)c)) { /* ** Accept only valid ASCII digits. - FM */ HTChunkPutc(string, c); /* accumulate a character NUMBER */ context->isHex = FALSE; context->state = S_incro; } else if (string->size == 0) { /* ** No 'x' or digit following the "&#" so recover ** them and recycle the character. - FM */ #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTART(badseq); #endif PUTC('&'); PUTC('#'); #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTOP(badseq); #endif context->state = S_text; goto top1; } break; /* ** Handle a numeric entity. */ case S_incro: /* S/390 -- gil -- 1075 */ /* CTRACE((tfp, "%s: %d: numeric %d %d\n", __FILE__, __LINE__, unsign_c, c)); */ if ((TOASCII(unsign_c) < 127) && (context->isHex ? isxdigit((unsigned char)c) : isdigit((unsigned char)c))) { /* ** Accept only valid hex or ASCII digits. - FM */ HTChunkPutc(string, c); /* accumulate a character NUMBER */ } else if (string->size == 0) { /* ** No hex digit following the "&#x" so recover ** them and recycle the character. - FM */ #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTART(badseq); #endif PUTC('&'); PUTC('#'); PUTC('x'); #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTOP(badseq); #endif context->isHex = FALSE; context->state = S_text; goto top1; } else { /* ** Terminate the numeric entity and try to handle it. - FM */ UCode_t code; int i; HTChunkTerminate(string); #ifdef USE_PRETTYSRC entity_string = string->data; #endif if ((context->isHex ? sscanf(string->data, "%lx", &code) : sscanf(string->data, "%ld", &code)) == 1) { /* =============== work in ASCII below here =============== S/390 -- gil -- 1092 */ if ((code == 1) || (code > 127 && code < 156)) { /* ** Assume these are Microsoft code points, ** inflicted on us by FrontPage. 
- FM ** ** MS FrontPage uses syntax like ™ in 128-159 range ** and doesn't follow Unicode standards for this area. ** Windows-1252 codepoints are assumed here. */ switch (code) { case 1: /* ** WHITE SMILING FACE */ code = 0x263a; break; case 128: /* ** EURO currency sign */ code = 0x20ac; break; case 130: /* ** SINGLE LOW-9 QUOTATION MARK (sbquo) */ code = 0x201a; break; case 132: /* ** DOUBLE LOW-9 QUOTATION MARK (bdquo) */ code = 0x201e; break; case 133: /* ** HORIZONTAL ELLIPSIS (hellip) */ code = 0x2026; break; case 134: /* ** DAGGER (dagger) */ code = 0x2020; break; case 135: /* ** DOUBLE DAGGER (Dagger) */ code = 0x2021; break; case 137: /* ** PER MILLE SIGN (permil) */ code = 0x2030; break; case 139: /* ** SINGLE LEFT-POINTING ANGLE QUOTATION MARK ** (lsaquo) */ code = 0x2039; break; case 145: /* ** LEFT SINGLE QUOTATION MARK (lsquo) */ code = 0x2018; break; case 146: /* ** RIGHT SINGLE QUOTATION MARK (rsquo) */ code = 0x2019; break; case 147: /* ** LEFT DOUBLE QUOTATION MARK (ldquo) */ code = 0x201c; break; case 148: /* ** RIGHT DOUBLE QUOTATION MARK (rdquo) */ code = 0x201d; break; case 149: /* ** BULLET (bull) */ code = 0x2022; break; case 150: /* ** EN DASH (ndash) */ code = 0x2013; break; case 151: /* ** EM DASH (mdash) */ code = 0x2014; break; case 152: /* ** SMALL TILDE (tilde) */ code = 0x02dc; break; case 153: /* ** TRADE MARK SIGN (trade) */ code = 0x2122; break; case 155: /* ** SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ** (rsaquo) */ code = 0x203a; break; default: /* ** Do not attempt a conversion ** to valid Unicode values. */ break; } } /* ** Check for special values. - FM */ if ((code == 8204) && (!context->element_stack || (context->element_stack->tag && context->element_stack->tag->contents == SGML_MIXED))) { /* ** Handle zwnj (8204) as . - FM */ char temp[8]; CTRACE((tfp, "SGML_character: Handling '8204' (zwnj) reference as 'WBR' element.\n")); /* ** Include the terminator if it is not ** the standard semi-colon. 
- FM */ if (c != ';') { sprintf(temp, "%c", c); } else { sprintf(temp, ""); } /* ** Add the replacement string to the ** recover buffer for processing. - FM */ if (context->recover == NULL) { StrAllocCopy(context->recover, temp); context->recover_index = 0; } else { StrAllocCat(context->recover, temp); } string->size = 0; context->isHex = FALSE; context->state = S_text; break; } else if (put_special_unicodes(context, code)) { /* ** We handled the value as a special character, ** so recycle the terminator or break. - FM */ #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(entity); PUTS( (context->isHex ? "&#x" : "&#") ); PUTS(entity_string); if (c == ';') PUTC(';'); PSRCSTOP(entity); } #endif string->size = 0; context->isHex = FALSE; context->state = S_text; if (c != ';') goto top1; break; } /* ** Seek a translation from the chartrans tables. */ if ((uck = UCTransUniChar(code, context->outUCLYhndl)) >= 32 && uck < 256 && (uck < 127 || uck >= LYlowest_eightbit[context->outUCLYhndl])) { #ifdef USE_PRETTYSRC if (!psrc_view) { #endif PUTC(FROMASCII((char)uck)); #ifdef USE_PRETTYSRC } else { PSRCSTART(entity); PUTS( (context->isHex ? "&#x" : "&#") ); PUTS(entity_string); PUTC(';'); PSRCSTOP(entity); } #endif } else if ((uck == -4 || (context->T.repl_translated_C0 && uck > 0 && uck < 32)) && /* ** Not found; look for replacement string. */ (uck = UCTransUniCharStr(replace_buf, 60, code, context->outUCLYhndl, 0) >= 0)) { #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(entity); PUTS( (context->isHex ? "&#x" : "&#") ); PUTS(entity_string); PUTC(';'); PSRCSTOP(entity); } else #endif for (p = replace_buf; *p; p++) { PUTC(*p); } /* ** If we're displaying UTF-8, try that now. - FM */ } else if (context->T.output_utf8 && PUTUTF8(code)) { ; /* do nothing more */ #ifdef NOTUSED_FOTEMODS /* ** If the value is greater than 255 and we do not ** have the "7-bit approximations" as our output ** character set (in which case we did it already) ** seek a translation for that. 
- FM */ } else if ((chk = ((code > 255) && context->outUCLYhndl != UCGetLYhndl_byMIME("us-ascii"))) && (uck = UCTransUniChar(code, UCGetLYhndl_byMIME("us-ascii"))) >= ' ' && uck < 127) { /* ** Got an ASCII character (yippey). - FM */ #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(entity); PUTS( (context->isHex ? "&#x" : "&#") ); PUTS(entity_string); PUTC(';'); PSRCSTOP(entity); } else #endif PUTC(((char)FROMASCII(uck & 0xff))); /* =============== work in ASCII above here =============== S/390 -- gil -- 1118 */ } else if ((chk && uck == -4) && (uck = UCTransUniCharStr(replace_buf, 60, code, UCGetLYhndl_byMIME("us-ascii"), 0) >= 0)) { /* ** Got a replacement string (yippey). - FM */ #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(entity); PUTS( (context->isHex ? "&#x" : "&#") ); PUTS(entity_string); PUTC(';'); PSRCSTOP(entity); } else #endif for (p = replace_buf; *p; p++) PUTC(*p); #endif /* NOTUSED_FOTEMODS */ /* ** Ignore 8205 (zwj), ** 8206 (lrm), and 8207 (rln), if we get to here. - FM */ } else if (code == 8205 || code == 8206 || code == 8207) { if (TRACE) { string->size--; LYstrncpy(replace_buf, string->data, (string->size < 64 ? string->size : 63)); fprintf(tfp, "SGML_character: Ignoring '%s%s'.\n", (context->isHex ? "&#x" : "&#"), replace_buf); } #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(badseq); PUTS( (context->isHex ? "&#x" : "&#") ); PUTS(entity_string); if (c == ';') PUTC(';'); PSRCSTOP(badseq); } #endif string->size = 0; context->isHex = FALSE; context->state = S_text; if (c != ';') goto top1; break; /* ** Show the numeric entity if we get to here ** and the value: ** (1) Is greater than 255 (but use ASCII characters ** for spaces or dashes). ** (2) Is less than 32, and not valid or we don't ** have HTCJK set. ** (3) Is 127 and we don't have HTPassHighCtrlRaw or ** HTCJK set. ** (4) Is 128 - 159 and we don't have HTPassHighCtrlNum ** set. 
** - FM */ } else if ((code > 255) || (code < ' ' && /* S/390 -- gil -- 1140 */ code != '\t' && code != '\n' && code != '\r' && HTCJK == NOCJK) || (TOASCII(code) == 127 && !(HTPassHighCtrlRaw || HTCJK != NOCJK)) || (TOASCII(code) > 127 && code < 160 && !HTPassHighCtrlNum)) { /* ** Unhandled or illegal value. Recover the ** "&#" or "&#x" and digit(s), and recycle ** the terminator. - FM */ #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(badseq); } #endif PUTC('&'); PUTC('#'); if (context->isHex) { PUTC('x'); context->isHex = FALSE; } string->size--; for (i = 0; i < string->size; i++) /* recover */ PUTC(string->data[i]); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTOP(badseq); } #endif string->size = 0; context->isHex = FALSE; context->state = S_text; goto top1; } else if (TOASCII(code) < 161 || /* S/390 -- gil -- 1162 */ HTPassEightBitNum || IncludesLatin1Enc) { /* ** No conversion needed. - FM */ #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(entity); PUTS( (context->isHex ? "&#x" : "&#") ); PUTS(entity_string); PUTC(';'); PSRCSTOP(entity); } else #endif PUTC(FROMASCII((char)code)); } else { /* ** Handle as named entity. - FM */ code -= 160; EntityName = HTMLGetEntityName(code); if (EntityName && EntityName[0] != '\0') { string->size = 0; for (i = 0; EntityName[i]; i++) HTChunkPutc(string, EntityName[i]); HTChunkTerminate(string); handle_entity(context, '\0'); /* ** Add a semi-colon if something went wrong ** and handle_entity() sent the string. - FM */ if (!FoundEntity) { PUTC(';'); } } else { /* ** Our conversion failed, so recover the "&#" ** and digit(s), and recycle the terminator. 
- FM */ #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTART(badseq); #endif PUTC('&'); PUTC('#'); if (context->isHex) { PUTC('x'); context->isHex = FALSE; } string->size--; for (i = 0; i < string->size; i++) /* recover */ PUTC(string->data[i]); #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTOP(badseq); #endif string->size = 0; context->isHex = FALSE; context->state = S_text; goto top1; } } /* ** If we get to here, we succeeded. Hoorah!!! - FM */ string->size = 0; context->isHex = FALSE; context->state = S_text; /* ** Don't eat the terminator if it's not ** the "standard" semi-colon for HTML. - FM */ if (c != ';') { goto top1; } } else { /* ** Not an entity, and don't know why not, so add ** the terminator to the string, output the "&#" ** or "&#x", and process the string via the recover ** element. - FM */ string->size--; HTChunkPutc(string, c); HTChunkTerminate(string); #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTART(badseq); #endif PUTC('&'); PUTC('#'); #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTOP(badseq); #endif if (context->isHex) { PUTC('x'); context->isHex = FALSE; } if (context->recover == NULL) { StrAllocCopy(context->recover, string->data); context->recover_index = 0; } else { StrAllocCat(context->recover, string->data); } string->size = 0; context->isHex = FALSE; context->state = S_text; break; } } break; /* ** Tag */ case S_tag: /* new tag */ if (TOASCII(unsign_c) < 127 && (string->size ? /* S/390 -- gil -- 1179 */ IsNmChar(c) : IsNmStart(c))) { /* ** Add valid ASCII character. - FM */ HTChunkPutc(string, c); } else if (c == '!' && !string->size) { /* state = S_exclamation; context->lead_exclamation = TRUE; context->doctype_bracket = FALSE; context->first_bracket = FALSE; HTChunkPutc(string, c); break; } else if (!string->size && (TOASCII(unsign_c) <= 160 && /* S/390 -- gil -- 1196 */ (c != '/' && c != '?' && c != '_' && c != ':'))) { /* ** '<' must be followed by an ASCII letter to be a valid ** start tag. 
Here it isn't, nor do we have a '/' for an ** end tag, nor one of some other characters with a ** special meaning for SGML or which are likely to be legal ** Name Start characters in XML or some other extension. ** So recover the '<' and following character as data. - FM & KW */ context->state = S_text; #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTART(badseq); #endif PUTC('<'); #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTOP(badseq); #endif goto top1; } else { /* End of tag name */ /* ** Try to handle tag. - FM */ HTTag * t; if (c == '/') { if (string->size == 0) { context->state = S_end; break; } CTRACE((tfp,"SGML: `<%.*s/' found!\n", string->size, string->data)); } HTChunkTerminate(string) ; t = SGMLFindTag(dtd, string->data); if (t == context->unknown_tag && ((c == ':' && string->size == 4 && 0 == strcasecomp(string->data, "URL")) || (string->size > 4 && 0 == strncasecomp(string->data, "URL:", 4)))) { /* ** Treat data); /* recover */ PUTC(c); #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTOP(badseq); #endif CTRACE((tfp, "SGML: Treating <%s%c as text\n", string->data, c)); string->size = 0; context->state = S_text; break; } if (c == '/' && t) { /* * Element name was ended by '/'. Remember the tag that * ended thusly, we'll interpret this as either an indication * of an empty element (if '>' follows directly) or do * some SGMLshortref-ish treatment. - kw */ context->slashedtag = t; } if (!t) { if (c == '?' 
&& string->size <= 1) { CTRACE((tfp, "SGML: Found PI, junking it until '>'\n")); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(abracket);PUTS("seen_nonwhite_in_junk_tag = TRUE; /*show all*/ } #endif context->state = S_junk_pi; break; } CTRACE((tfp, "SGML: *** Invalid element %s\n", string->data)); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(abracket);PUTC('<');PSRCSTOP(abracket); PSRCSTART(badtag); if (tagname_transform != 1) { if (tagname_transform == 0) LYLowerCase(string->data); else LYUpperCase(string->data); } PUTS(string->data); if (c == '>' ) { PSRCSTOP(badtag); PSRCSTART(abracket);PUTC('>');PSRCSTOP(abracket); } else { PUTC(c); } } #endif context->state = (c == '>') ? S_text : S_junk_tag; break; } else if (t == context->unknown_tag) { CTRACE((tfp, "SGML: *** Unknown element %s\n", string->data)); /* ** Fall through and treat like valid ** tag for attribute parsing. - KW */ } context->current_tag = t; #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(abracket);PUTC('<');PSRCSTOP(abracket); if (t != context->unknown_tag) PSRCSTART(tag); else PSRCSTART(badtag); if (tagname_transform != 1) { if (tagname_transform == 0) LYLowerCase(string->data); else LYUpperCase(string->data); } PUTS(string->data); if (t != context->unknown_tag) PSRCSTOP(tag); else PSRCSTOP(badtag); } if (!psrc_view) /*don't waste time */ #endif { /* ** Clear out attributes. */ memset( (void*)context->present, 0 , sizeof(BOOL)* context->current_tag->number_of_attributes); } string->size = 0; context->current_attribute_number = INVALID; #ifdef USE_PRETTYSRC if (psrc_view) { if (c == '>' || c == '<' || (c == '/' && context->slashedtag)) { if (c != '<') { PSRCSTART(abracket); PUTC(c); PSRCSTOP(abracket); context->state = (c == '>') ? 
S_text : S_tagname_slash; } else context->state = S_tag; } else { if (!WHITE(c)) PUTC(c); context->state = S_tag_gap; } } else #endif if (c == '>' || c == '<' || (c == '/' && context->slashedtag)) { if (context->current_tag->name) start_element(context); context->state = (c == '>') ? S_text : (c == '<') ? S_tag : S_tagname_slash; } else { context->state = S_tag_gap; } } break; case S_exclamation: if (context->lead_exclamation && c == '-') { /* ** Set up for possible comment. - FM */ context->lead_exclamation = FALSE; context->first_dash = TRUE; HTChunkPutc(string, c); break; } if (context->lead_exclamation && c == '[') { /* ** Set up for possible marked section. - FM */ context->lead_exclamation = FALSE; context->first_bracket = TRUE; context->second_bracket = FALSE; HTChunkPutc(string, c); context->state = S_marked; break; } if (context->first_dash && c == '-') { /* ** Set up to handle comment. - FM */ context->lead_exclamation = FALSE; context->first_dash = FALSE; context->end_comment = FALSE; HTChunkPutc(string, c); context->state = S_comment; break; } context->lead_exclamation = FALSE; context->first_dash = FALSE; if (c == '>') { /* ** Try to handle identifier. - FM */ HTChunkTerminate(string); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(sgmlspecial); PUTC('<'); PUTS(string->data); PUTC('>'); PSRCSTOP(sgmlspecial); } else #endif handle_identifier(context); string->size = 0; context->state = S_text; break; } if (WHITE(c)) { if (string->size == 8 && !strncasecomp(string->data, "!DOCTYPE", 8)) { /* ** Set up for DOCTYPE declaration. - FM */ HTChunkPutc(string, c); context->doctype_bracket = FALSE; context->state = S_doctype; break; } if (string->size == 7 && !strncasecomp(string->data, "!ENTITY", 7)) { /* ** Set up for ENTITY declaration. 
- FM */ HTChunkPutc(string, c); context->first_dash = FALSE; context->end_comment = TRUE; context->state = S_sgmlent; break; } if (string->size == 8 && !strncasecomp(string->data, "!ELEMENT", 8)) { /* ** Set up for ELEMENT declaration. - FM */ HTChunkPutc(string, c); context->first_dash = FALSE; context->end_comment = TRUE; context->state = S_sgmlele; break; } if (string->size == 8 && !strncasecomp(string->data, "!ATTLIST", 8)) { /* ** Set up for ATTLIST declaration. - FM */ HTChunkPutc(string, c); context->first_dash = FALSE; context->end_comment = TRUE; context->state = S_sgmlatt; break; } } HTChunkPutc(string, c); break; case S_comment: /* Expecting comment. - FM */ if (historical_comments) { /* ** Any '>' terminates. - FM */ if (c == '>') { HTChunkTerminate(string); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(comm); PUTC('<'); PUTS_TR(string->data); PUTC('>'); PSRCSTOP(comm); } else #endif handle_comment(context); string->size = 0; context->end_comment = FALSE; context->first_dash = FALSE; context->state = S_text; break; } goto S_comment_put_c; } if (!context->first_dash && c == '-') { HTChunkPutc(string, c); context->first_dash = TRUE; break; } if (context->first_dash && c == '-') { HTChunkPutc(string, c); context->first_dash = FALSE; if (!context->end_comment) context->end_comment = TRUE; else if (!minimal_comments) /* ** Validly treat '--' pairs as successive comments ** (for minimal, any "--WHITE>" terminates). - FM */ context->end_comment = FALSE; break; } if (context->end_comment && c == '>') { /* ** Terminate and handle the comment. 
- FM */ HTChunkTerminate(string); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(comm); PUTC('<'); PUTS_TR(string->data); PUTC('>'); PSRCSTOP(comm); } else #endif handle_comment(context); string->size = 0; context->end_comment = FALSE; context->first_dash = FALSE; context->state = S_text; break; } context->first_dash = FALSE; if (context->end_comment && !isspace(c)) context->end_comment = FALSE; S_comment_put_c: if (context->T.decode_utf8 && *context->utf_buf) { HTChunkPuts(string, context->utf_buf); context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; } else if (HTCJK == NOCJK && (context->T.output_utf8 || context->T.trans_from_uni)) { if (clong == 0xfffd && saved_char_in && HTPassEightBitRaw && (unsigned char)saved_char_in >= LYlowest_eightbit[context->outUCLYhndl]) { HTChunkPutUtf8Char(string, (0xf000 | (unsigned char)saved_char_in)); } else { HTChunkPutUtf8Char(string, clong); } } else if (saved_char_in && context->T.use_raw_char_in) { HTChunkPutc(string, saved_char_in); } else { HTChunkPutc(string, c); } break; case S_doctype: /* Expecting DOCTYPE. - FM */ if (context->doctype_bracket) { HTChunkPutc(string, c); if (c == ']') context->doctype_bracket = FALSE; break; } if (c == '[' && WHITE(string->data[string->size - 1])) { HTChunkPutc(string, c); context->doctype_bracket = TRUE; break; } if (c == '>') { HTChunkTerminate(string); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(sgmlspecial); PUTC('<'); PUTS(string->data); PUTC('>'); PSRCSTOP(sgmlspecial); } else #endif handle_doctype(context); string->size = 0; context->state = S_text; break; } HTChunkPutc(string, c); break; case S_marked: /* Expecting marked section. 
- FM */ if (context->first_bracket && c == '[') { HTChunkPutc(string, c); context->first_bracket = FALSE; context->second_bracket = TRUE; break; } if (context->second_bracket && c == ']' && string->data[string->size - 1] == ']') { HTChunkPutc(string, c); context->second_bracket = FALSE; break; } if (!context->second_bracket && c == '>') { HTChunkTerminate(string); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(sgmlspecial); PUTC('<'); PUTS(string->data); PUTC('>'); PSRCSTOP(sgmlspecial); } else #endif handle_marked(context); string->size = 0; context->state = S_text; break; } HTChunkPutc(string, c); break; case S_sgmlent: /* Expecting ENTITY. - FM */ if (!context->first_dash && c == '-') { HTChunkPutc(string, c); context->first_dash = TRUE; break; } if (context->first_dash && c == '-') { HTChunkPutc(string, c); context->first_dash = FALSE; if (!context->end_comment) context->end_comment = TRUE; else context->end_comment = FALSE; break; } if (context->end_comment && c == '>') { HTChunkTerminate(string); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(sgmlspecial); PUTC('<'); PUTS(string->data); PUTC('>'); PSRCSTOP(sgmlspecial); } else #endif handle_sgmlent(context); string->size = 0; context->end_comment = FALSE; context->first_dash = FALSE; context->state = S_text; break; } context->first_dash = FALSE; HTChunkPutc(string, c); break; case S_sgmlele: /* Expecting ELEMENT. 
- FM */ if (!context->first_dash && c == '-') { HTChunkPutc(string, c); context->first_dash = TRUE; break; } if (context->first_dash && c == '-') { HTChunkPutc(string, c); context->first_dash = FALSE; if (!context->end_comment) context->end_comment = TRUE; else context->end_comment = FALSE; break; } if (context->end_comment && c == '>') { HTChunkTerminate(string); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(sgmlspecial); PUTC('<'); PUTS(string->data); PUTC('>'); PSRCSTOP(sgmlspecial); } else #endif handle_sgmlele(context); string->size = 0; context->end_comment = FALSE; context->first_dash = FALSE; context->state = S_text; break; } context->first_dash = FALSE; HTChunkPutc(string, c); break; case S_sgmlatt: /* Expecting ATTLIST. - FM */ if (!context->first_dash && c == '-') { HTChunkPutc(string, c); context->first_dash = TRUE; break; } if (context->first_dash && c == '-') { HTChunkPutc(string, c); context->first_dash = FALSE; if (!context->end_comment) context->end_comment = TRUE; else context->end_comment = FALSE; break; } if (context->end_comment && c == '>') { HTChunkTerminate(string); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(sgmlspecial); PUTC('<'); PUTS(string->data); PUTC('>'); PSRCSTOP(sgmlspecial); } else #endif handle_sgmlatt(context); string->size = 0; context->end_comment = FALSE; context->first_dash = FALSE; context->state = S_text; break; } context->first_dash = FALSE; HTChunkPutc(string, c); break; case S_tag_gap: /* Expecting attribute or '>' */ if (WHITE(c)) break; /* Gap between attributes */ if (c == '>') { /* End of tag */ #ifdef USE_PRETTYSRC if (!psrc_view) #endif if (context->current_tag->name) start_element(context); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(abracket); PUTC('>'); PSRCSTOP(abracket); } #endif context->state = S_text; break; } HTChunkPutc(string, c); context->state = S_attr; /* Get attribute */ break; /* accumulating value */ case S_attr: if (WHITE(c) || (c == '>') || (c == '=')) { /* End of word */ 
HTChunkTerminate(string); handle_attribute_name(context, string->data); #ifdef USE_PRETTYSRC if (!psrc_view) { #endif string->size = 0; if (c == '>') { /* End of tag */ if (context->current_tag->name) start_element(context); context->state = S_text; break; } #ifdef USE_PRETTYSRC } else { PUTC(' '); if (context->current_attribute_number == INVALID) PSRCSTART(badattr); else PSRCSTART(attrib); if (attrname_transform != 1) { if (attrname_transform == 0) LYLowerCase(string->data); else LYUpperCase(string->data); } PUTS(string->data); if (c == '=' ) PUTC('='); if (context->current_attribute_number == INVALID) PSRCSTOP(badattr); else PSRCSTOP(attrib); if (c == '>') { PSRCSTART(abracket); PUTC('>'); PSRCSTOP(abracket); context->state = S_text; break; } string->size = 0; } #endif context->state = (c == '=' ? S_equals: S_attr_gap); } else { HTChunkPutc(string, c); } break; case S_attr_gap: /* Expecting attribute or '=' or '>' */ if (WHITE(c)) break; /* Gap after attribute */ if (c == '>') { /* End of tag */ #ifdef USE_PRETTYSRC if (psrc_view) { if (context->current_attribute_number == INVALID) PSRCSTOP(badattr); else PSRCSTOP(attrib); PSRCSTART(abracket); PUTC('>'); PSRCSTOP(abracket); } else #endif if (context->current_tag->name) start_element(context); context->state = S_text; break; } else if (c == '=') { #ifdef USE_PRETTYSRC if (psrc_view) { PUTC('='); if (context->current_attribute_number == INVALID) PSRCSTOP(badattr); else PSRCSTOP(attrib); } #endif context->state = S_equals; break; } #ifdef USE_PRETTYSRC /* we are here because this char seemed the beginning of attrname */ if (psrc_view && context->current_attribute_number == INVALID) { PSRCSTOP(badattr); PUTC(' '); } #endif HTChunkPutc(string, c); context->state = S_attr; /* Get next attribute */ break; case S_equals: /* After attr = */ if (WHITE(c)) break; /* Before attribute value */ if (c == '>') { /* End of tag */ CTRACE((tfp, "SGML: found = but no value\n")); #ifdef USE_PRETTYSRC if (psrc_view) { 
PSRCSTART(abracket); PUTC('>'); PSRCSTOP(abracket); } else #endif if (context->current_tag->name) start_element(context); context->state = S_text; break; } else if (c == '\'') { #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(attrval); PUTC(c); } #endif context->state = S_squoted; break; } else if (c == '"') { #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(attrval); PUTC(c); } #endif context->state = S_dquoted; break; } #ifdef USE_PRETTYSRC if (psrc_view) PSRCSTART(attrval); #endif context->state = S_value; /* no break! fall through to S_value and proccess current `c` */ case S_value: if (WHITE(c) || (c == '>')) { /* End of word */ HTChunkTerminate(string) ; #ifdef USE_PRETTYSRC if (psrc_view) { /*PSRCSTART(attrval);*/ if (attr_is_name) { HTStartAnchor(context->target, string->data, NULL); (*context->actions->end_element)( context->target, HTML_A, (char **)&context->include); } else if (attr_is_href) { PSRCSTART(href); HTStartAnchor(context->target,NULL,string->data); } PUTS_TR(string->data); if (attr_is_href) { (*context->actions->end_element)( context->target, HTML_A, (char **)&context->include); PSRCSTOP(href); } PSRCSTOP(attrval); } else #endif #ifdef CJK_EX /* Quick hack. 
- JH7AYN */ { char jis_buf[512]; if (string->data[0] == '$') { if (string->data[1] == 'B' || string->data[1] == '@') { jis_buf[0] = '\033'; strcpy(jis_buf + 1, string->data); TO_EUC(jis_buf, string->data); } } } #endif handle_attribute_value(context, string->data); string->size = 0; if (c == '>') { /* End of tag */ #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(abracket); PUTC('>'); PSRCSTOP(abracket); } else #endif if (context->current_tag->name) start_element(context); context->state = S_text; break; } else context->state = S_tag_gap; } else if (context->T.decode_utf8 && *context->utf_buf) { HTChunkPuts(string, context->utf_buf); context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; } else if (HTCJK == NOCJK && (context->T.output_utf8 || context->T.trans_from_uni)) { if (clong == 0xfffd && saved_char_in && HTPassEightBitRaw && (unsigned char)saved_char_in >= LYlowest_eightbit[context->outUCLYhndl]) { HTChunkPutUtf8Char(string, (0xf000 | (unsigned char)saved_char_in)); } else { HTChunkPutUtf8Char(string, clong); } } else if (saved_char_in && context->T.use_raw_char_in) { HTChunkPutc(string, saved_char_in); } else { HTChunkPutc(string, c); } break; case S_squoted: /* Quoted attribute value */ if (c == '\'') { /* End of attribute value */ HTChunkTerminate(string) ; #ifdef USE_PRETTYSRC if (psrc_view) { /*PSRCSTART(attrval);*/ if (attr_is_name) { HTStartAnchor(context->target,string->data,NULL); (*context->actions->end_element)( context->target, HTML_A, (char **)&context->include); } else if (attr_is_href) { PSRCSTART(href); HTStartAnchor(context->target,NULL,string->data); } PUTS_TR(string->data); if (attr_is_href) { (*context->actions->end_element)( context->target, HTML_A, (char **)&context->include); PSRCSTOP(href); } PUTC('\''); PSRCSTOP(attrval); } else #endif handle_attribute_value(context, string->data); string->size = 0; context->state = S_tag_gap; } else if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1213 */ /* ** Setting up for possible 
single quotes in CJK escape ** sequences. - Takuya ASADA (asada@three-a.co.jp) */ context->state = S_esc_sq; HTChunkPutc(string, c); } else if (context->T.decode_utf8 && *context->utf_buf) { HTChunkPuts(string, context->utf_buf); context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; } else if (HTCJK == NOCJK && (context->T.output_utf8 || context->T.trans_from_uni)) { if (clong == 0xfffd && saved_char_in && HTPassEightBitRaw && (unsigned char)saved_char_in >= LYlowest_eightbit[context->outUCLYhndl]) { HTChunkPutUtf8Char(string, (0xf000 | (unsigned char)saved_char_in)); } else { HTChunkPutUtf8Char(string, clong); } } else if (saved_char_in && context->T.use_raw_char_in) { HTChunkPutc(string, saved_char_in); } else { HTChunkPutc(string, c); } break; case S_dquoted: /* Quoted attribute value */ if (c == '"' || /* Valid end of attribute value */ (soft_dquotes && /* If emulating old Netscape bug, treat '>' */ c == '>')) { /* as a co-terminator of dquoted and tag */ HTChunkTerminate(string) ; #ifdef USE_PRETTYSRC if (psrc_view) { /*PSRCSTART(attrval);*/ if (attr_is_name) { HTStartAnchor(context->target,string->data,NULL); (*context->actions->end_element)( context->target, HTML_A, (char **)&context->include); } else if (attr_is_href) { PSRCSTART(href); HTStartAnchor(context->target,NULL,string->data); } PUTS_TR(string->data); if (attr_is_href) { (*context->actions->end_element)( context->target, HTML_A, (char **)&context->include); PSRCSTOP(href); } PUTC(c); PSRCSTOP(attrval); } else #endif handle_attribute_value(context, string->data); string->size = 0; context->state = S_tag_gap; if (c == '>') /* We emulated the Netscape bug, so we go */ goto top1; /* back and treat it as the tag terminator */ } else if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1230 */ /* ** Setting up for possible double quotes in CJK escape ** sequences. 
- Takuya ASADA (asada@three-a.co.jp) */ context->state = S_esc_dq; HTChunkPutc(string, c); } else if (context->T.decode_utf8 && *context->utf_buf) { HTChunkPuts(string, context->utf_buf); context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; } else if (HTCJK == NOCJK && (context->T.output_utf8 || context->T.trans_from_uni)) { if (clong == 0xfffd && saved_char_in && HTPassEightBitRaw && (unsigned char)saved_char_in >= LYlowest_eightbit[context->outUCLYhndl]) { HTChunkPutUtf8Char(string, (0xf000 | (unsigned char)saved_char_in)); } else { HTChunkPutUtf8Char(string, clong); } } else if (saved_char_in && context->T.use_raw_char_in) { HTChunkPutc(string, saved_char_in); } else { HTChunkPutc(string, c); } break; case S_end: /* size ? /* S/390 -- gil -- 1247 */ IsNmChar(c) : IsNmStart(c))) { HTChunkPutc(string, c); } else { /* End of end tag name */ HTTag * t = 0; #ifdef USE_PRETTYSRC BOOL psrc_tagname_processed = FALSE; #endif HTChunkTerminate(string); if (!*string->data) { /* Empty end tag */ if (context->element_stack) t = context->element_stack->tag; } else { t = SGMLFindTag(dtd, string->data); } if (!t || t == context->unknown_tag) { CTRACE((tfp, "Unknown end tag \n", string->data)); #ifdef USE_PRETTYSRC if (psrc_view) { PSRCSTART(abracket); PUTC('<'); PUTC('/'); PSRCSTOP(abracket); PSRCSTART(badtag); if (tagname_transform != 1) { if (tagname_transform == 0) LYLowerCase(string->data); else LYUpperCase(string->data); } PUTS(string->data); if (c != '>') { PUTC(c); } else { PSRCSTOP(badtag); PSRCSTART(abracket); PUTC('>'); PSRCSTOP(abracket); } psrc_tagname_processed=TRUE; } } else if (psrc_view) { #endif } else { BOOL tag_OK = (BOOL) (c == '>' || WHITE(c)); #if OPT HTMLElement e = TAGNUM_OF_TAGP(t); int branch = 2; /* it can be 0,1,2*/ #endif context->current_tag = t; if (HAS_ALT_TAGNUM(TAGNUM_OF_TAGP(t)) && context->element_stack && ALT_TAGP(t) == context->element_stack->tag) context->element_stack->tag = NORMAL_TAGP(context->element_stack->tag); #if OPT 
if (tag_OK #ifdef EXTENDED_HTMLDTD && Old_DTD #endif ) { switch (e) { case HTML_DD: case HTML_DT: case HTML_LI: case HTML_LH : case HTML_TD: case HTML_TH: case HTML_TR: case HTML_THEAD: case HTML_TFOOT : case HTML_TBODY : case HTML_COLGROUP: branch = 0; break; case HTML_A: case HTML_B: case HTML_BLINK: case HTML_CITE: case HTML_EM: case HTML_FONT: case HTML_FORM: case HTML_I: case HTML_P: case HTML_STRONG: case HTML_TT: case HTML_U: branch = 1; break; default: break; } } #endif #ifdef EXTENDED_HTMLDTD /* ** Just handle ALL end tags normally :-) - kw */ if (!Old_DTD) { end_element( context, context->current_tag); } else #endif /* EXTENDED_HTMLDTD */ if (tag_OK && #if OPT (branch == 0) #else (!strcasecomp(string->data, "DD") || !strcasecomp(string->data, "DT") || !strcasecomp(string->data, "LI") || !strcasecomp(string->data, "LH") || !strcasecomp(string->data, "TD") || !strcasecomp(string->data, "TH") || !strcasecomp(string->data, "TR") || !strcasecomp(string->data, "THEAD") || !strcasecomp(string->data, "TFOOT") || !strcasecomp(string->data, "TBODY") || !strcasecomp(string->data, "COLGROUP")) #endif ) { /* ** Don't treat these end tags as invalid, ** nor act on them. 
- FM */ CTRACE((tfp, "SGML: `data, c)); string->size = 0; context->current_attribute_number = INVALID; if (c != '>') { context->state = S_junk_tag; } else { context->current_tag = NULL; context->state = S_text; } break; } else if (tag_OK && #if OPT (branch == 1) #else (!strcasecomp(string->data, "A") || !strcasecomp(string->data, "B") || !strcasecomp(string->data, "BLINK") || !strcasecomp(string->data, "CITE") || !strcasecomp(string->data, "EM") || !strcasecomp(string->data, "FONT") || !strcasecomp(string->data, "FORM") || !strcasecomp(string->data, "I") || !strcasecomp(string->data, "P") || !strcasecomp(string->data, "STRONG") || !strcasecomp(string->data, "TT") || !strcasecomp(string->data, "U")) #endif ) { /* ** Handle end tags for container elements declared ** as SGML_EMPTY to prevent "expected tag substitution" ** but still processed via HTML_end_element() in HTML.c ** with checks there to avoid throwing the HTML.c stack ** out of whack (Ugh, what a hack! 8-). - FM */ if (context->inSELECT) { /* ** We are in a SELECT block. - FM */ if (strcasecomp(string->data, "FORM")) { /* ** It is not at FORM end tag, so ignore it. - FM */ CTRACE((tfp, "SGML: ***Ignoring end tag in SELECT block.\n", string->data)); } else { /* ** End the SELECT block and then ** handle the FORM end tag. - FM */ CTRACE((tfp, "SGML: ***Faking SELECT end tag before end tag.\n", string->data)); end_element(context, SGMLFindTag(context->dtd, "SELECT")); CTRACE((tfp, "SGML: End \n", string->data)); #ifdef USE_PRETTYSRC if (!psrc_view) /* Don't actually call if viewing psrc - kw */ #endif (*context->actions->end_element) (context->target, TAGNUM_OF_TAGP(context->current_tag), (char **)&context->include); } } else if (!strcasecomp(string->data, "P")) { /* ** Treat a P end tag like a P start tag (Ugh, ** what a hack! 8-). 
- FM */ CTRACE((tfp, "SGML: `data, c, string->data, c)); { int i; for (i = 0; i < context->current_tag->number_of_attributes; i++) { context->present[i] = NO; } } if (context->current_tag->name) start_element(context); } else { CTRACE((tfp, "SGML: End \n", string->data)); #ifdef USE_PRETTYSRC if (!psrc_view) /* Don't actually call if viewing psrc - kw */ #endif (*context->actions->end_element) (context->target, TAGNUM_OF_TAGP(context->current_tag), (char **)&context->include); } string->size = 0; context->current_attribute_number = INVALID; if (c != '>') { context->state = S_junk_tag; } else { context->current_tag = NULL; context->state = S_text; } break; } else { /* ** Handle all other end tags normally. - FM */ end_element( context, context->current_tag); } } #ifdef USE_PRETTYSRC if (psrc_view && !psrc_tagname_processed) { PSRCSTART(abracket); PUTC('<'); PUTC('/'); PSRCSTOP(abracket); PSRCSTART(tag); if (tagname_transform != 1) { if (tagname_transform == 0) LYLowerCase(string->data); else LYUpperCase(string->data); } PUTS(string->data); PSRCSTOP(tag); if ( c != '>' ) { PSRCSTART(badtag); PUTC(c); } else { PSRCSTART(abracket); PUTC('>'); PSRCSTOP(abracket); } } #endif string->size = 0; context->current_attribute_number = INVALID; if (c != '>') { if (!WHITE(c)) CTRACE((tfp,"SGML: `data, c)); context->state = S_junk_tag; } else { context->current_tag = NULL; context->state = S_text; } } break; case S_esc: /* Expecting '$'or '(' following CJK ESC. */ if (c == '$') { context->state = S_dollar; } else if (c == '(') { context->state = S_paren; } else { context->state = S_text; } PUTC(c); break; case S_dollar: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ if (c == '@' || c == 'B' || c == 'A') { context->state = S_nonascii_text; } else if (c == '(') { context->state = S_dollar_paren; } PUTC(c); break; case S_dollar_paren: /* Expecting 'C' after CJK "ESC$(". 
*/ if (c == 'C') { context->state = S_nonascii_text; } else { context->state = S_text; } PUTC(c); break; case S_paren: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ if (c == 'B' || c == 'J' || c == 'T') { context->state = S_text; } else if (c == 'I') { context->state = S_nonascii_text; } else { context->state = S_text; } PUTC(c); break; case S_nonascii_text: /* Expecting CJK ESC after non-ASCII text. */ if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1264 */ context->state = S_esc; } PUTC(c); if (c < 32) context->state = S_text; break; case S_esc_sq: /* Expecting '$'or '(' following CJK ESC. */ if (c == '$') { context->state = S_dollar_sq; } else if (c == '(') { context->state = S_paren_sq; } else { context->state = S_squoted; } HTChunkPutc(string, c); break; case S_dollar_sq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ if (c == '@' || c == 'B' || c == 'A') { context->state = S_nonascii_text_sq; } else if (c == '(') { context->state = S_dollar_paren_sq; } HTChunkPutc(string, c); break; case S_dollar_paren_sq: /* Expecting 'C' after CJK "ESC$(". */ if (c == 'C') { context->state = S_nonascii_text_sq; } else { context->state = S_squoted; } HTChunkPutc(string, c); break; case S_paren_sq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ if (c == 'B' || c == 'J' || c == 'T') { context->state = S_squoted; } else if (c == 'I') { context->state = S_nonascii_text_sq; } else { context->state = S_squoted; } HTChunkPutc(string, c); break; case S_nonascii_text_sq: /* Expecting CJK ESC after non-ASCII text. */ if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1281 */ context->state = S_esc_sq; } HTChunkPutc(string, c); break; case S_esc_dq: /* Expecting '$'or '(' following CJK ESC. */ if (c == '$') { context->state = S_dollar_dq; } else if (c == '(') { context->state = S_paren_dq; } else { context->state = S_dquoted; } HTChunkPutc(string, c); break; case S_dollar_dq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". 
*/ if (c == '@' || c == 'B' || c == 'A') { context->state = S_nonascii_text_dq; } else if (c == '(') { context->state = S_dollar_paren_dq; } HTChunkPutc(string, c); break; case S_dollar_paren_dq: /* Expecting 'C' after CJK "ESC$(". */ if (c == 'C') { context->state = S_nonascii_text_dq; } else { context->state = S_dquoted; } HTChunkPutc(string, c); break; case S_paren_dq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ if (c == 'B' || c == 'J' || c == 'T') { context->state = S_dquoted; } else if (c == 'I') { context->state = S_nonascii_text_dq; } else { context->state = S_dquoted; } HTChunkPutc(string, c); break; case S_nonascii_text_dq: /* Expecting CJK ESC after non-ASCII text. */ if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1298 */ context->state = S_esc_dq; } HTChunkPutc(string, c); break; case S_junk_tag: case S_junk_pi: if (c == '>') { #ifdef USE_PRETTYSRC if (psrc_view) { if (context->state == S_junk_tag) { PSRCSTOP(badtag); } PSRCSTART(abracket); PUTC('>'); PSRCSTOP(abracket); context->seen_nonwhite_in_junk_tag = FALSE; } #endif context->current_tag = NULL; context->state = S_text; } #ifdef USE_PRETTYSRC else if (psrc_view) { /*pack spaces until first non-space is seen*/ if (!context->seen_nonwhite_in_junk_tag) { if (!WHITE(c)) { context->seen_nonwhite_in_junk_tag = TRUE; PUTC(c); } } else PUTC(c); } #endif } /* switch on context->state */ after_switch: /* ** Check whether an external function has added ** anything to the include buffer. If so, move the ** new stuff to the beginning of active_include. 
- kw */ if (context->include != NULL) { if (context->include[0] == '\0') { FREE(context->include); } else { if (context->active_include && context->active_include[context->include_index] != '\0') StrAllocCat(context->include, context->active_include + context->include_index); FREE(context->active_include); context->active_include = context->include; context->include_index = 0; context->include = NULL; } } /* ** Check whether we've added anything to the recover buffer. - FM */ if (context->recover != NULL) { if (context->recover[context->recover_index] == '\0') { FREE(context->recover); context->recover_index = 0; } else { c = context->recover[context->recover_index]; context->recover_index++; goto top; } } /* ** Check whether an external function had added ** anything to the include buffer; it should now be ** in active_include. - FM / kw */ if (context->active_include != NULL) { if (context->active_include[context->include_index] == '\0') { FREE(context->active_include); context->include_index = 0; } else { if (context->current_tag_charset == UTF8 || context->T.trans_from_uni) { /* * If it looks like we would have fed UTF-8 to the * next processing stage, assume that whatever we were * fed back is in UTF-8 form, too. This won't be always * true for all uses of the include buffer, but it's a * start. - kw */ char *puni = context->active_include + context->include_index; c = *puni; clong = UCGetUniFromUtf8String(&puni); if (clong < 256 && clong >= 0) { c = ((char)(clong & 0xff)); } saved_char_in = '\0'; context->include_index = puni - context->active_include + 1; goto top1; } else { /* * Otherwise assume no UTF-8 - do charset-naive processing * and hope for the best. - kw */ c = context->active_include[context->include_index]; context->include_index++; goto top; } } } /* ** Check whether an external function has added ** anything to the csi buffer. 
- FM */ if (context->csi != NULL) { if (context->csi[context->csi_index] == '\0') { FREE(context->csi); context->csi_index = 0; } else { c = context->csi[context->csi_index]; context->csi_index++; goto top; } } } /* SGML_character */ PRIVATE void SGML_string ARGS2( HTStream *, context, CONST char*, str) { CONST char *p; for (p = str; *p; p++) SGML_character(context, *p); } PRIVATE void SGML_write ARGS3( HTStream *, context, CONST char*, str, int, l) { CONST char *p; CONST char *e = str+l; for (p = str; p < e; p++) SGML_character(context, *p); } /*_______________________________________________________________________ */ /* Structured Object Class ** ----------------------- */ PUBLIC CONST HTStreamClass SGMLParser = { "SGMLParser", SGML_free, SGML_abort, SGML_character, SGML_string, SGML_write, }; /* Create SGML Engine ** ------------------ ** ** On entry, ** dtd represents the DTD, along with ** actions is the sink for the data as a set of routines. ** */ PUBLIC HTStream* SGML_new ARGS3( CONST SGML_dtd *, dtd, HTParentAnchor *, anchor, HTStructured *, target) { int i; HTStream* context = (HTStream *) malloc(sizeof(*context)); if (!context) outofmem(__FILE__, "SGML_begin"); context->isa = &SGMLParser; context->string = HTChunkCreate(128); /* Grow by this much */ context->dtd = dtd; context->target = target; context->actions = (CONST HTStructuredClass*)(((HTStream*)target)->isa); /* Ugh: no OO */ context->unknown_tag = &HTTag_unrecognized; /* context->extra_tags = dtd->tags + dtd->number_of_tags; */ context->current_tag = context->slashedtag = NULL; context->state = S_text; context->kanji_buf = '\0'; context->element_stack = 0; /* empty */ context->inSELECT = FALSE; context->no_lynx_specialcodes = NO; /* special codes normally generated */ #ifdef CALLERDATA context->callerData = (void*) callerData; #endif /* CALLERDATA */ for (i = 0; i < MAX_ATTRIBUTES; i++) context->value[i] = 0; context->lead_exclamation = FALSE; context->first_dash = FALSE; context->end_comment = 
FALSE; context->doctype_bracket = FALSE; context->first_bracket = FALSE; context->second_bracket = FALSE; context->isHex = FALSE; context->node_anchor = anchor; /* Could be NULL? */ context->utf_count = 0; context->utf_char = 0; context->utf_buf[0] = context->utf_buf[6] = '\0'; context->utf_buf_p = context->utf_buf; UCTransParams_clear(&context->T); context->inUCLYhndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_PARSER); if (context->inUCLYhndl < 0) { HTAnchor_copyUCInfoStage(anchor, UCT_STAGE_PARSER, UCT_STAGE_MIME, -1); context->inUCLYhndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_PARSER); } context->inUCI = HTAnchor_getUCInfoStage(anchor, UCT_STAGE_PARSER); set_chartrans_handling(context, anchor, -1); context->recover = NULL; context->recover_index = 0; context->include = NULL; context->active_include = NULL; context->include_index = 0; context->url = NULL; context->csi = NULL; context->csi_index = 0; #ifdef USE_PRETTYSRC if (psrc_view) { psrc_view = FALSE; mark_htext_as_source = TRUE; SGML_string(context, "source
");
	psrc_view = TRUE;
	psrc_convert_string = FALSE;
	sgml_in_psrc_was_initialized = TRUE;
	context->seen_nonwhite_in_junk_tag = FALSE;
    }
#endif

    return context;
}

/*		Asian character conversion functions
**		====================================
**
**	Added 24-Mar-96 by FM, based on:
**
////////////////////////////////////////////////////////////////////////
Copyright (c) 1993 Electrotechnical Laboratory (ETL)

Permission to use, copy, modify, and distribute this material
for any purpose and without fee is hereby granted, provided
that the above copyright notice and this permission notice
appear in all copies, and that the name of ETL not be
used in advertising or publicity pertaining to this
material without the specific, prior written permission
of an authorized representative of ETL.
ETL MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY
OF THIS MATERIAL FOR ANY PURPOSE.  IT IS PROVIDED "AS IS",
WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
/////////////////////////////////////////////////////////////////////////
Content-Type:	program/C; charset=US-ASCII
Program:	SJIS.c
Author:		Yutaka Sato 
Description:
History:
	930923	extracted from codeconv.c of cosmos
///////////////////////////////////////////////////////////////////////
*/

/*
**	When nonzero (the default), SJIS_TO_EUC() converts every byte pair
**	that IS_SJIS() accepts; when zero, SJIS_TO_EUC() copies its input
**	through one byte at a time without testing for Shift-JIS at all.
*/
PUBLIC int TREAT_SJIS = 1;

/*
**	Replace one two-byte input code by an entry from a fixed lookup
**	table (per the function name: JIS X 0201 kana to JIS X 0208 in EUC
**	form).
**
**  On entry,
**	IHI, ILO	the input byte pair.
**	OHI, OLO	where the two result bytes are stored.
**
**  On exit,
**	If IHI is 0x8E and ILO lies in 0xA1..0xDF, *OHI/*OLO receive the
**	two bytes of table entry (ILO - 0xA1).  Any other input pair is
**	stored unchanged.
*/
PUBLIC void JISx0201TO0208_EUC ARGS4(
	register unsigned char,		IHI,
	register unsigned char,		ILO,
	register unsigned char *,	OHI,
	register unsigned char *,	OLO)
{
    /* One entry per ILO value from 0xA1 through 0xDF (63 entries). */
    static CONST char *table[] = {
	"\241\243",	/* A1,A3 */
	"\241\326",	/* A1,D6 */
	"\241\327",	/* A1,D7 */
	"\241\242",	/* A1,A2 */
	"\241\246",	/* A1,A6 */
	"\245\362",	/* A5,F2 */
	"\245\241",	/* A5,A1 */
	"\245\243",	/* A5,A3 */
	"\245\245",	/* A5,A5 */
	"\245\247",	/* A5,A7 */
	"\245\251",	/* A5,A9 */
	"\245\343",	/* A5,E3 */
	"\245\345",	/* A5,E5 */
	"\245\347",	/* A5,E7 */
	"\245\303",	/* A5,C3 */
	"\241\274",	/* A1,BC */
	"\245\242",	/* A5,A2 */
	"\245\244",	/* A5,A4 */
	"\245\246",	/* A5,A6 */
	"\245\250",	/* A5,A8 */
	"\245\252",	/* A5,AA */
	"\245\253",	/* A5,AB */
	"\245\255",	/* A5,AD */
	"\245\257",	/* A5,AF */
	"\245\261",	/* A5,B1 */
	"\245\263",	/* A5,B3 */
	"\245\265",	/* A5,B5 */
	"\245\267",	/* A5,B7 */
	"\245\271",	/* A5,B9 */
	"\245\273",	/* A5,BB */
	"\245\275",	/* A5,BD */
	"\245\277",	/* A5,BF */
	"\245\301",	/* A5,C1 */
	"\245\304",	/* A5,C4 */
	"\245\306",	/* A5,C6 */
	"\245\310",	/* A5,C8 */
	"\245\312",	/* A5,CA */
	"\245\313",	/* A5,CB */
	"\245\314",	/* A5,CC */
	"\245\315",	/* A5,CD */
	"\245\316",	/* A5,CE */
	"\245\317",	/* A5,CF */
	"\245\322",	/* A5,D2 */
	"\245\325",	/* A5,D5 */
	"\245\330",	/* A5,D8 */
	"\245\333",	/* A5,DB */
	"\245\336",	/* A5,DE */
	"\245\337",	/* A5,DF */
	"\245\340",	/* A5,E0 */
	"\245\341",	/* A5,E1 */
	"\245\342",	/* A5,E2 */
	"\245\344",	/* A5,E4 */
	"\245\346",	/* A5,E6 */
	"\245\350",	/* A5,E8 */
	"\245\351",	/* A5,E9 */
	"\245\352",	/* A5,EA */
	"\245\353",	/* A5,EB */
	"\245\354",	/* A5,EC */
	"\245\355",	/* A5,ED */
	"\245\357",	/* A5,EF */
	"\245\363",	/* A5,F3 */
	"\241\253",	/* A1,AB */
	"\241\254"	/* A1,AC */
    };
    unsigned char hi = IHI;
    unsigned char lo = ILO;

    if (IHI == 0x8E && ILO >= 0xA1 && ILO <= 0xDF) {
	CONST char *pair = table[ILO - 0xA1];

	hi = (unsigned char) pair[0];
	lo = (unsigned char) pair[1];
    }
    *OHI = hi;
    *OLO = lo;
}

/*
**	Report whether a NUL-terminated byte string contains at least one
**	byte pair that the IS_SJIS() macro accepts.
**
**	Only bytes with the high bit set are tested, each together with the
**	byte that follows it.  Returns 1 at the first accepted pair and 0
**	if the end of the string is reached without one.
*/
PRIVATE int IS_SJIS_STR ARGS1(CONST unsigned char *, str)
{
    CONST unsigned char *p;
    int seen_sjis = 0;	/* scratch state passed to IS_SJIS() */

    for (p = str; *p != '\0'; p++) {
	unsigned char lead = p[0];

	if ((lead & 0x80) != 0 && IS_SJIS(lead, p[1], seen_sjis))
	    return 1;
    }
    return 0;
}

/*
**	Convert one Shift-JIS byte pair (HI, LO) to the corresponding
**	two-byte JIS code.
**
**  On exit,
**	JCODE[0] and JCODE[1] hold the result; JCODE is also returned.
**
**	All arithmetic is carried out in unsigned char, so intermediate
**	values wrap modulo 256.  The input is not validated.
*/
PUBLIC unsigned char * SJIS_TO_JIS1 ARGS3(
	register unsigned char,		HI,
	register unsigned char,		LO,
	register unsigned char *,	JCODE)
{
    unsigned char row = (unsigned char) (HI - ((HI <= 0x9F) ? 0x71 : 0xB1));
    unsigned char col = LO;

    /* Each lead byte covers two rows; start with the odd one. */
    row = (unsigned char) ((row << 1) + 1);

    /* Trail bytes skip 0x7F, so close that gap first. */
    if (col > 0x7F)
	col--;

    if (col >= 0x9E) {
	/* Upper half of the trail range: the even row. */
	col = (unsigned char) (col - 0x7D);
	row++;
    } else {
	col = (unsigned char) (col - 0x1F);
    }

    JCODE[0] = row;
    JCODE[1] = col;
    return JCODE;
}

/*
**	Convert one two-byte JIS code (HI, LO) to Shift-JIS.
**
**  On exit,
**	SJCODE[0] and SJCODE[1] hold the result; SJCODE is also returned.
**
**	Arithmetic is done in unsigned char and wraps modulo 256.  The
**	input is not validated.
*/
PUBLIC unsigned char * JIS_TO_SJIS1 ARGS3(
	register unsigned char,		HI,
	register unsigned char,		LO,
	register unsigned char *,	SJCODE)
{
    /* Odd rows go to the lower trail range, even rows to the upper. */
    unsigned char trail = (unsigned char) (LO + ((HI & 1) ? 0x1F : 0x7D));
    unsigned char lead;

    /* Trail bytes skip 0x7F. */
    if (trail >= 0x7F)
	trail++;

    lead = (unsigned char) (((HI - 0x21) >> 1) + 0x81);
    if (lead > 0x9F)
	lead = (unsigned char) (lead + 0x40);

    SJCODE[0] = lead;
    SJCODE[1] = trail;
    return SJCODE;
}

/*
**	Convert one EUC byte pair (HI, LO) to Shift-JIS.
**
**	A pair whose first byte is 0x8E is first passed through
**	JISx0201TO0208_EUC().  The high bit of both bytes is then cleared
**	and the pair is handed to JIS_TO_SJIS1().
**
**  On exit,
**	SJCODE[0..1] hold the result; SJCODE is also returned.
*/
PUBLIC unsigned char * EUC_TO_SJIS1 ARGS3(
	unsigned char,			HI,
	unsigned char,			LO,
	register unsigned char *,	SJCODE)
{
    unsigned char hi = HI;
    unsigned char lo = LO;

    if (hi == 0x8E)
	JISx0201TO0208_EUC(hi, lo, &hi, &lo);

    return JIS_TO_SJIS1((unsigned char) (hi & 0x7F),
			(unsigned char) (lo & 0x7F),
			SJCODE);
}

/*
**	Convert the single byte I to a two-byte Shift-JIS code.
**
**	I is paired with a 0x8E lead byte and run through
**	JISx0201TO0208_EUC(); the result, with high bits cleared, is then
**	re-encoded by JIS_TO_SJIS1().
**
**  On exit,
**	*OHI and *OLO hold the two Shift-JIS bytes.
*/
PUBLIC void JISx0201TO0208_SJIS ARGS3(
	register unsigned char,		I,
	register unsigned char *,	OHI,
	register unsigned char *,	OLO)
{
    unsigned char pair[2];

    /* Stage 1: *OHI / *OLO temporarily hold the EUC form. */
    JISx0201TO0208_EUC(0x8E, I, OHI, OLO);

    /* Stage 2: strip the high bits and re-encode as Shift-JIS. */
    JIS_TO_SJIS1((unsigned char) (*OHI & 0x7F),
		 (unsigned char) (*OLO & 0x7F),
		 pair);

    *OHI = pair[0];
    *OLO = pair[1];
}

/*
**	Convert one Shift-JIS byte pair (HI, LO) to EUC: convert to JIS
**	with SJIS_TO_JIS1(), then set the high bit of both result bytes.
**
**  On exit,
**	data[0..1] hold the result; data is also returned.
*/
PUBLIC unsigned char * SJIS_TO_EUC1 ARGS3(
	unsigned char,		HI,
	unsigned char,		LO,
	unsigned char *,	data)
{
    int i;

    SJIS_TO_JIS1(HI, LO, data);
    for (i = 0; i < 2; i++)
	data[i] = (unsigned char) (data[i] | 0x80);
    return data;
}

/*
**	Convert a NUL-terminated string from Shift-JIS to EUC.
**
**  On entry,
**	src	the input string.
**	dst	the output buffer; the output is never longer than the
**		input, so strlen(src) + 1 bytes suffice.
**
**  On exit,
**	dst holds the converted, NUL-terminated string and is returned.
**
**	Byte pairs accepted by IS_SJIS() are converted (only while
**	TREAT_SJIS is nonzero); every other byte is copied unchanged.
*/
PUBLIC unsigned char * SJIS_TO_EUC ARGS2(
	unsigned char *,	src,
	unsigned char *,	dst)
{
    unsigned char *in = src;
    unsigned char *out = dst;
    int in_sjis = IS_SJIS_STR(src);	/* initial state for IS_SJIS() */

    while (in[0] != '\0') {
	unsigned char hi = in[0];
	unsigned char lo = in[1];

	if (!TREAT_SJIS || !IS_SJIS(hi, lo, in_sjis)) {
	    *out++ = *in++;
	    continue;
	}

	/* Shift-JIS pair: JIS code with both high bits set is EUC. */
	SJIS_TO_JIS1(hi, lo, out);
	out[0] |= 0x80;
	out[1] |= 0x80;
	out += 2;
	in += 2;
    }
    *out = 0;
    return dst;
}

/*
**	Convert a NUL-terminated string from EUC to Shift-JIS.
**
**  On entry,
**	src	the input string.
**	dst	the output buffer; the output is never longer than the
**		input.
**
**  On exit,
**	dst holds the converted, NUL-terminated string and is returned.
**
**	Two consecutive bytes with the high bit set are converted as one
**	character.  A high-bit byte that is not followed by another
**	high-bit byte is dropped.  7-bit bytes are copied unchanged.
*/
PUBLIC unsigned char * EUC_TO_SJIS ARGS2(
	unsigned char *,	src,
	unsigned char *,	dst)
{
    unsigned char *in = src;
    unsigned char *out = dst;

    while (*in != '\0') {
	if ((in[0] & 0x80) == 0) {
	    /* Plain 7-bit byte. */
	    *out++ = *in++;
	} else if ((in[1] & 0x80) != 0) {
	    /* Complete two-byte character. */
	    JIS_TO_SJIS1((unsigned char) (in[0] & 0x7F),
			 (unsigned char) (in[1] & 0x7F),
			 out);
	    out += 2;
	    in += 2;
	} else {
	    /* Unpaired 8-bit byte: skip it. */
	    in++;
	}
    }
    *out = 0;
    return dst;
}

/*
**	Copy string b to a and yield a pointer to a's terminating NUL, so
**	the caller can continue appending there.  Both arguments are
**	evaluated more than once.
*/
#define Strcpy(a,b)	(strcpy((char*)a,(CONST char*)b),&a[strlen((CONST char*)a)])

/*
**	Convert a NUL-terminated EUC string to 7-bit JIS.
**
**  On entry,
**	src	the input string.
**	dst	the output buffer; it must have room for the text plus
**		every inserted copy of toK and toA.
**	toK	string written before a run of two-byte characters.
**	toA	string written when such a run ends.
**
**  On exit,
**	dst holds the converted, NUL-terminated string and is returned.
**
**	An 8-bit byte that does not start a pair accepted by IS_EUC() is
**	copied through as-is, except that a 0xA0 byte is dropped once any
**	such stray byte has already been seen.
*/
PUBLIC unsigned char *EUC_TO_JIS ARGS4(
	unsigned char *,	src,
	unsigned char *,	dst,
	CONST char *,		toK,
	CONST char *,		toA)
{
    unsigned char *in = src;
    unsigned char *out = dst;
    unsigned char ch;
    int in_kanji = 0;		/* nonzero while toK is in effect */
    int stray_8bit = 0;		/* count of 8-bit bytes that were not EUC */

    for (ch = *in++; ch != '\0'; ch = *in++) {
	if ((ch & 0x80) == 0) {
	    /* 7-bit byte: leave two-byte mode first if necessary. */
	    if (in_kanji) {
		in_kanji = 0;
		out = Strcpy(out, toA);
	    }
	    *out++ = ch;
	} else if (!IS_EUC(ch, *in)) {
	    /* Stray 8-bit byte; 0xA0 is ignored after the first stray. */
	    if (ch != 0xA0 || !stray_8bit) {
		stray_8bit++;
		*out++ = ch;
	    }
	} else {
	    if (!in_kanji) {
		in_kanji = 1;
		out = Strcpy(out, toK);
	    }
	    if (*in & 0x80) {
		*out++ = (unsigned char) (ch & ~0x80);
		*out++ = (unsigned char) (*in++ & ~0x80);
	    }
	}
    }

    /* Never leave the output in two-byte mode. */
    if (in_kanji)
	out = Strcpy(out, toA);

    if (out)
	*out = 0;
    return dst;
}

/* True when both bytes lie strictly between 0x20 and 0x7F. */
#define	IS_JIS7(c1,c2)	(0x20<(c1)&&(c1)<0x7F && 0x20<(c2)&&(c2)<0x7F)
/*
**	'N' - 0x40 and 'O' - 0x40, i.e. Control-N and Control-O; on an ASCII
**	host these are the SO (0x0E) and SI (0x0F) codes.  TO_EUC() drops
**	them from plain text as long as no 8-bit byte has been seen.
*/
#define SO		('N'-0x40)
#define SI		('O'-0x40)

/*
**	When nonzero, TO_EUC() hands a TO_2BCODE byte that is followed by
**	'B' or '@' while not in JIS mode to repairJIStoEUC().
**	Off (0) by default.
*/
PUBLIC int repair_JIS = 0;

/*
**	Try to convert a run of two-byte 7-bit JIS text to EUC.
**
**  On entry,
**	src	the first byte pair of the run.
**	*dstp	where output is to be written.
**
**  On exit,
**	Success: the run ended with the pair "(B" or "(J".  *dstp is
**	advanced past the bytes written and the return value points just
**	behind that terminating pair in the input.
**
**	Failure: NULL is returned, either because the input ended (a NUL
**	in either byte of a pair) or because a pair failed IS_JIS7().
**	*dstp is left unchanged, although bytes beyond it may already
**	have been overwritten.
*/
PRIVATE CONST unsigned char *repairJIStoEUC ARGS2(
	CONST unsigned char *,	src,
	unsigned char **,	dstp)
{
    CONST unsigned char *in = src;
    unsigned char *out = *dstp;

    for (;;) {
	unsigned char b1 = in[0];
	unsigned char b2;

	if (b1 == '\0')
	    break;
	b2 = in[1];
	if (b2 == '\0')
	    break;
	in += 2;

	/* "(B" / "(J" marks the end of the two-byte run. */
	if (b1 == '(' && (b2 == 'B' || b2 == 'J')) {
	    *dstp = out;
	    return in;
	}
	if (!IS_JIS7(b1, b2))
	    return NULL;

	*out++ = (unsigned char) (0x80 | b1);
	*out++ = (unsigned char) (0x80 | b2);
    }
    return NULL;
}

#if 0	/* NOTUSED */

/*
**	HTML character entity references recognized by isHTMLentity(),
**	each with the single character it stands for.  The list ends with
**	a NULL name.
**
**	Every name must begin with '&' and end with ';': isHTMLentity()
**	reports a match only when it reaches the ';'.  The names are
**	written as split string literals ("&" "lt;") so that HTML-aware
**	tooling cannot collapse them into the characters they denote.
*/
static struct {
    char *ee;	/* entity reference, including '&' and ';' */
    char de;	/* the character it decodes to */
} entities[] = {
    {"&" "lt;", '<' },
    {"&" "gt;", '>' },
    {"&" "amp;", '&'},
    {"&" "quot;", '"'},
    {NULL, 0}
};

/*
**	Test whether str begins with one of the references listed in the
**	entities[] table.
**
**  On entry,
**	str	text to examine.  It may start at the '&' or at the
**		character following it; in the latter case the leading
**		'&' of each table entry is skipped when comparing.
**	chp	receives the decoded character on a match.
**
**  On exit,
**	Returns the number of characters of str that make up the
**	reference (through the ';'), or 0 if nothing matched, in which
**	case *chp is left untouched.
*/
PRIVATE int isHTMLentity ARGS2(
	char *, str,
	int *, chp)
{
    int skip = (*str == '&') ? 0 : 1;
    int ei;

    for (ei = 0; entities[ei].ee != NULL; ei++) {
	CONST char *name = entities[ei].ee + skip;
	int ej = 0;

	while (name[ej] != '\0' && name[ej] == str[ej]) {
	    if (name[ej] == ';') {
		*chp = entities[ei].de;
		return ej + 1;
	    }
	    ej++;
	}
    }
    return 0;
}

/*
**	Store ch through sp and advance sp; if sp is NULL nothing is stored
**	and the expression simply yields ch.
*/
#define sputc(sp,ch)	(sp?(*sp++ = ch):ch)

/*
**	Copy src to dst while repairing two-byte escape-sequence text that
**	was left unterminated or had HTML entities substituted into it.
**
**  On entry,
**	src	NUL-terminated input.
**	dst	output buffer, or NULL to only scan (see sputc).
**	ctype	content type; entity decoding is enabled only when
**		strcasecomp() reports it equal to "text/html".
**
**  On exit,
**	Returns 1 if a repair was made (a switch back to one-byte mode
**	was inserted, or an entity inside two-byte text was decoded),
**	otherwise 0.
*/
PUBLIC int FIX_2022 ARGS3(
	char *, src,
	char *, dst,
	char *, ctype)
{
    int in2B;		/* nonzero while inside two-byte text */
    char ch1, ch2, *sp, *dp;
    int bad;		/* return value: set once any repair is made */
    int isHTML, len, ech;

    in2B = 0;
    sp = src;
    dp = dst;
    bad = 0;

    isHTML = strcasecomp(ctype, "text/html") == 0;

    while ((ch1 = *sp++) != '\0') {
	if (ch1 == ESC) {
	    /* Recognized escape sequences are copied verbatim. */
	    if (*sp == TO_2BCODE) {
		if (sp[1] == 'B' || sp[1] == '@') {
		    in2B = 1;
		    sputc(dp, ch1);
		    sputc(dp, *sp++);
		    sputc(dp, *sp++);
		    continue;
		}
	    } else if (*sp == TO_1BCODE) {
		if (sp[1] == 'B' || sp[1] == 'J') {
		    in2B = 0;
		    sputc(dp, ch1);
		    sputc(dp, *sp++);
		    sputc(dp, *sp++);
		    continue;
		}
	    }
	}
	if (in2B) {
	    /*
	     * A byte <= 0x20 or the start of an end tag ("</") inside
	     * two-byte text: emit ESC TO_1BCODE 'B' ahead of it and
	     * leave two-byte mode.
	     */
	    if ((ch1 <= 0x20)
		|| (sp[0] <= 0x20)
		|| (ch1 == '<' && sp[0] == '/')
		|| (sp[0] == '<' && sp[1] == '/')) {
		in2B = 0;
		sputc(dp, ESC);
		sputc(dp, TO_1BCODE);
		sputc(dp, 'B');
		sputc(dp, ch1);
		bad = 1;
		continue;
	    }
	    /*
	     * Either byte of the pair may have been replaced by an
	     * entity reference; decode it back, unless the reference
	     * runs up to the end of the input.
	     */
	    if (isHTML && ch1 == '&')
		if ((len = isHTMLentity(sp, &ech)) != '\0')
		    if (sp[len] != 0) {
			ch1 = ech;
			sp += len;
			bad = 1;
		    }
	    ch2 = *sp++;

	    if (isHTML && ch2 == '&')
		if ((len = isHTMLentity(sp, &ech)) != '\0')
		    if (sp[len] != 0) {
			ch2 = ech;
			sp += len;
			bad = 1;
		    }
	    sputc(dp, ch1);
	    sputc(dp, ch2);
	} else {
	    sputc(dp, ch1);
	}
    }
    sputc(dp, 0);
    return bad;
}

#endif

/*
**	Convert a NUL-terminated string that may contain escape-sequence
**	JIS and/or Shift-JIS text to EUC.
**
**  On entry,
**	jis	the input string.
**	euc	the output buffer.  NOTE(review): callers in this file
**		allocate strlen(jis) + 1 bytes for it - confirm no branch
**		can emit more bytes than it consumes.
**
**  On exit,
**	euc holds the converted, NUL-terminated string and is returned.
*/
PUBLIC unsigned char *TO_EUC ARGS2(
	CONST unsigned char *,	jis,
	unsigned char *,	euc)
{
    register CONST unsigned char *s;
    register unsigned char c, jis_stat;
    unsigned char *d;
    register int to1B, to2B;
    register int in_sjis = 0;
    static int nje;	/* call counter; only incremented in this function */
    int n8bits;		/* number of 8-bit bytes seen so far */
    int is_JIS;		/* nonzero once JIS or Shift-JIS text was seen */

    nje++;
    n8bits = 0;
    s = jis;
    d = euc;
    jis_stat = 0;	/* 0x80 while in two-byte JIS mode, else 0 */
    to2B = TO_2BCODE;
    to1B = TO_1BCODE;
    in_sjis = IS_SJIS_STR(jis);
    is_JIS = 0;

    while ((c = *s++) != '\0') {
	if (c == 0x80)
	    continue;		/* ignore it */
	if (c == 0xA0 && is_JIS)
	    continue;		/* ignore Non-breaking space */

	/*
	 * Optional repair: a TO_2BCODE byte followed by 'B' or '@' while
	 * not in JIS mode is offered to repairJIStoEUC(); if that
	 * succeeds, scanning resumes where it stopped.
	 */
	if (c == to2B && jis_stat == 0 && repair_JIS) {
	    if (*s == 'B' || *s == '@') {
		CONST unsigned char *ts;
		if ((ts = repairJIStoEUC(s + 1, &d)) != NULL) {
		    s = ts;
		    continue;
		}
	    }
	}
	/*
	 * Escape sequences.  The recognized mode switches are consumed;
	 * any other sequence just resets JIS mode and the ESC byte falls
	 * through to the copying code below.
	 */
	if (c == ESC) {
	    if (*s == to2B) {
		if ((s[1] == 'B') || (s[1] == '@')) {
		    jis_stat = 0x80;
		    s += 2;
		    is_JIS++;
		    continue;
		}
		jis_stat = 0;
	    } else if (*s == to1B) {
		jis_stat = 0;
		if ((s[1] == 'B') || (s[1] == 'J') || (s[1] == 'H')) {
		    s += 2;
		    continue;
		}
	    } else if (*s == ',') {	/* MULE */
		jis_stat = 0;
	    }
	}
	if (c & 0x80)
	    n8bits++;

	if (IS_SJIS(c, *s, in_sjis)) {
	    /* Shift-JIS pair. */
	    SJIS_TO_EUC1(c, *s, d);
	    d += 2;
	    s++;
	    is_JIS++;
	} else if (jis_stat) {
	    /* Two-byte JIS mode. */
	    if (c <= 0x20 || 0x7F <= c) {
		*d++ = c;
		if (c == '\n')
		    jis_stat = 0;	/* a newline ends JIS mode */
	    } else {
		if (IS_JIS7(c, *s)) {
		    /* Setting the high bit (jis_stat is 0x80) gives EUC. */
		    *d++ = jis_stat | c;
		    *d++ = jis_stat | *s++;
		} else
		    *d++ = c;
	    }
	} else {
	    /* Plain text: SI/SO are dropped until an 8-bit byte is seen. */
	    if (n8bits == 0 && (c == SI || c == SO)) {
	    } else {
		*d++ = c;
	    }
	}
    }
    *d = 0;
    return euc;
}

/* True for a 7-bit value that is 0x20 or below, or equal to 0x7F. */
#define non94(ch) ((ch) <= 0x20 || (ch) == 0x7F)

/*
**	Check that every 8-bit byte in a NUL-terminated string belongs to a
**	well-formed two-byte pair.
**
**	A pair is well formed when both bytes have the high bit set and,
**	with that bit cleared, neither byte satisfies non94().
**
**	Returns 1 if the whole string passes (a string without any 8-bit
**	byte passes trivially), 0 at the first violation.
*/
PRIVATE int is_EUC_JP ARGS1(unsigned char *, euc)
{
    unsigned char *p = euc;

    while (*p != '\0') {
	int lead = *p;

	if (lead & 0x80) {
	    int trail = p[1] & 0xFF;

	    if (!(trail & 0x80))
		return 0;
	    if (non94(lead & 0x7F) || non94(trail & 0x7F))
		return 0;
	    p++;	/* step over the trail byte as well */
	}
	p++;
    }
    return 1;
}

/*
**	Convert a NUL-terminated string to Shift-JIS.
**
**  On entry,
**	any	the input, in any encoding TO_EUC() handles.
**	sjis	the output buffer; it must hold at least strlen(any) + 1
**		bytes (the verbatim-copy path writes exactly that much).
**
**  On exit,
**	sjis holds the NUL-terminated result.
**
**	The input is first normalized to EUC in a temporary buffer.  If
**	that result passes is_EUC_JP() it is converted with EUC_TO_SJIS();
**	otherwise the input is copied to sjis unchanged.
*/
PUBLIC void TO_SJIS ARGS2(
	CONST unsigned char *,	any,
	unsigned char *,	sjis)
{
    unsigned char *euc;

    euc = malloc(strlen((CONST char *) any) + 1);
    /*
     * Check the allocation in every build, not only under CJK_EX:
     * TO_EUC() writes through the pointer unconditionally.
     */
    if (!euc)
	outofmem(__FILE__, "TO_SJIS");
    TO_EUC(any, euc);
    if (is_EUC_JP(euc))
	EUC_TO_SJIS(euc, sjis);
    else
	strcpy((char *) sjis, (CONST char *) any);
    free(euc);
}

/*
**	Convert a NUL-terminated string to 7-bit JIS.
**
**  On entry,
**	any	the input, in any encoding TO_EUC() handles.
**	jis	the output buffer; it must have room for the converted
**		text plus the TO_KANJI / TO_ASCII sequences that
**		EUC_TO_JIS() inserts.
**
**  On exit,
**	jis holds the NUL-terminated result.  An empty input yields an
**	empty output without any allocation.
*/
PUBLIC void TO_JIS ARGS2(
	CONST unsigned char *,	any,
	unsigned char *,	jis)
{
    unsigned char *euc;

    if (any[0] == 0) {
	jis[0] = 0;
	return;
    }
    euc = malloc(strlen((CONST char *) any) + 1);
    /*
     * Check the allocation in every build, not only under CJK_EX:
     * TO_EUC() writes through the pointer unconditionally.
     */
    if (!euc)
	outofmem(__FILE__, "TO_JIS");
    TO_EUC(any, euc);

    /*
     * The conversion is applied unconditionally.  The former fallback,
     * which copied `any' verbatim when the intermediate text failed
     * is_EUC_JP(), had already been disabled.
     */
    EUC_TO_JIS(euc, jis, TO_KANJI, TO_ASCII);

    free(euc);
}