diff options
author | Thomas E. Dickey <dickey@invisible-island.net> | 2010-04-29 22:00:22 -0400 |
---|---|---|
committer | Thomas E. Dickey <dickey@invisible-island.net> | 2010-04-29 22:00:22 -0400 |
commit | dc748b1c47baadafae2c90f0e188927b11b7e029 (patch) | |
tree | c728869dc6504570b9bffb7459ccbdd1bf264a9f /WWW/Library/Implementation/SGML.c | |
parent | d4093cadbda3787dfb165954f8f6521790cfac86 (diff) | |
download | lynx-snapshots-dc748b1c47baadafae2c90f0e188927b11b7e029.tar.gz |
snapshot of project "lynx", label v2_8_8dev_6c
Diffstat (limited to 'WWW/Library/Implementation/SGML.c')
-rw-r--r-- | WWW/Library/Implementation/SGML.c | 4959 |
1 files changed, 0 insertions, 4959 deletions
diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c deleted file mode 100644 index 3a388b90..00000000 --- a/WWW/Library/Implementation/SGML.c +++ /dev/null @@ -1,4959 +0,0 @@ -/* - * $LynxId: SGML.c,v 1.132 2009/08/27 10:29:27 tom Exp $ - * - * General SGML Parser code SGML.c - * ======================== - * - * This module implements an HTStream object. To parse an - * SGML file, create this object which is a parser. The object - * is (currently) created by being passed a DTD structure, - * and a target HTStructured object at which to throw the parsed stuff. - * - * 6 Feb 93 Binary searches used. Interface modified. - */ - -#include <HTUtils.h> - -#include <SGML.h> -#include <HTMLDTD.h> -#include <HTAccess.h> -#include <HTCJK.h> /* FIXME: this doesn't belong in SGML.c */ -#include <UCMap.h> -#include <UCDefs.h> -#include <UCAux.h> - -#include <HTChunk.h> -#include <HTUtils.h> - -#include <LYCharSets.h> -#include <LYCharVals.h> /* S/390 -- gil -- 0635 */ -#include <LYGlobalDefs.h> -#include <LYStrings.h> -#include <LYLeaks.h> -#include <LYUtils.h> - -#ifdef USE_COLOR_STYLE -# include <LYStyle.h> -#endif -#ifdef USE_PRETTYSRC -# include <LYPrettySrc.h> -#endif - -#define INVALID (-1) - -static int sgml_offset; - -#ifdef USE_PRETTYSRC - -static char *entity_string; /* this is used for printing entity name. - - Unconditionally added since redundant assigments don't hurt much */ - -static void fake_put_character(void *p GCC_UNUSED, - char c GCC_UNUSED) -{ -} - -#define START TRUE -#define STOP FALSE - -#define PUTS_TR(x) psrc_convert_string = TRUE; PUTS(x) - -#endif - -/* my_casecomp() - optimized by the first character, NOT_ASCII ok */ -#define my_casecomp(a,b) ((TOUPPER(*a) == TOUPPER(*b)) ? \ - AS_casecomp(a,b) : \ - (TOASCII(TOUPPER(*a)) - TOASCII(TOUPPER(*b)))) - - /* will use partially inlined version */ -#define orig_HTChunkPutUtf8Char HTChunkPutUtf8Char -#undef HTChunkPutUtf8Char - -/* ...used for comments and attributes value like href... */ -#define HTChunkPutUtf8Char(ch,x) \ - { \ - if ((TOASCII(x) < 128) && (ch->size < ch->allocated)) \ - ch->data[ch->size++] = (char)x; \ - else \ - orig_HTChunkPutUtf8Char(ch,x); \ - } - -#define PUTS(str) ((*context->actions->put_string)(context->target, str)) -#define PUTC(ch) ((*context->actions->put_character)(context->target, (char) ch)) -#define PUTUTF8(code) (UCPutUtf8_charstring((HTStream *)context->target, \ - (putc_func_t*)(context->actions->put_character), code)) - -#ifdef USE_PRETTYSRC -#define PRETTYSRC_PUTC(c) if (psrc_view) PUTC(c) -#else -#define PRETTYSRC_PUTC(c) /* nothing */ -#endif - -/*the following macros are used for pretty source view. */ -#define IS_C(attr) (attr.type == HTMLA_CLASS) - -HTCJKlang HTCJK = NOCJK; /* CJK enum value. */ -BOOL HTPassEightBitRaw = FALSE; /* Pass 161-172,174-255 raw. */ -BOOL HTPassEightBitNum = FALSE; /* Pass ^ numeric entities raw. */ -BOOL HTPassHighCtrlRaw = FALSE; /* Pass 127-160,173, raw. */ -BOOL HTPassHighCtrlNum = FALSE; /* Pass €-Ÿ raw. */ - -/* The State (context) of the parser - * - * This is passed with each call to make the parser reentrant - * - */ - -#define MAX_ATTRIBUTES 36 /* Max number of attributes per element */ - -/* Element Stack - * ------------- - * This allows us to return down the stack reselecting styles. - * As we return, attribute values will be garbage in general. - */ -typedef struct _HTElement HTElement; -struct _HTElement { - HTElement *next; /* Previously nested element or 0 */ - HTTag *tag; /* The tag at this level */ -}; - -typedef enum { - S_text = 0 - ,S_attr - ,S_attr_gap - ,S_comment - ,S_cro - ,S_doctype - ,S_dollar - ,S_dollar_dq - ,S_dollar_paren - ,S_dollar_paren_dq - ,S_dollar_paren_sq - ,S_dollar_sq - ,S_dquoted - ,S_end - ,S_entity - ,S_equals - ,S_ero - ,S_esc - ,S_esc_dq - ,S_esc_sq - ,S_exclamation - ,S_in_kanji - ,S_incro - ,S_junk_tag - ,S_litteral - ,S_marked - ,S_nonascii_text - ,S_nonascii_text_dq - ,S_nonascii_text_sq - ,S_paren - ,S_paren_dq - ,S_paren_sq - ,S_pcdata - ,S_pi - ,S_script - ,S_sgmlatt - ,S_sgmlele - ,S_sgmlent - ,S_squoted - ,S_tag - ,S_tag_gap - ,S_tagname_slash - ,S_value -} sgml_state; - -/* Internal Context Data Structure - * ------------------------------- - */ -struct _HTStream { - - const HTStreamClass *isa; /* inherited from HTStream */ - - const SGML_dtd *dtd; - const HTStructuredClass *actions; /* target class */ - HTStructured *target; /* target object */ - - HTTag *current_tag; - HTTag *slashedtag; - const HTTag *unknown_tag; - BOOL extended_html; /* xhtml */ - BOOL strict_xml; /* xml */ - BOOL inSELECT; - BOOL no_lynx_specialcodes; - int current_attribute_number; - HTChunk *string; - int leading_spaces; - int trailing_spaces; - HTElement *element_stack; - sgml_state state; - unsigned char kanji_buf; -#ifdef CALLERDATA - void *callerData; -#endif /* CALLERDATA */ - BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */ - char *value[MAX_ATTRIBUTES]; /* NULL, or strings alloc'd with StrAllocCopy_extra() */ - - BOOL lead_exclamation; - BOOL first_dash; - BOOL end_comment; - BOOL doctype_bracket; - BOOL first_bracket; - BOOL second_bracket; - BOOL isHex; - - HTParentAnchor *node_anchor; - LYUCcharset *inUCI; /* pointer to anchor UCInfo */ - int inUCLYhndl; /* charset we are fed */ - LYUCcharset *outUCI; /* anchor UCInfo for target */ - int outUCLYhndl; /* charset for target */ - char utf_count; - UCode_t utf_char; - char utf_buf[8]; - char *utf_buf_p; - UCTransParams T; - int current_tag_charset; /* charset to pass attributes */ - - char *recover; - int recover_index; - char *include; - char *active_include; - int include_index; - char *url; - char *csi; - int csi_index; -#ifdef USE_PRETTYSRC - BOOL cur_attr_is_href; - BOOL cur_attr_is_name; -#endif -}; - -#ifdef NO_LYNX_TRACE -#define state_name(n) "state" -#else -static const char *state_name(sgml_state n) -{ - const char *result = "?"; - /* *INDENT-OFF* */ - switch (n) { - case S_attr: result = "S_attr"; break; - case S_attr_gap: result = "S_attr_gap"; break; - case S_comment: result = "S_comment"; break; - case S_cro: result = "S_cro"; break; - case S_doctype: result = "S_doctype"; break; - case S_dollar: result = "S_dollar"; break; - case S_dollar_dq: result = "S_dollar_dq"; break; - case S_dollar_paren: result = "S_dollar_paren"; break; - case S_dollar_paren_dq: result = "S_dollar_paren_dq"; break; - case S_dollar_paren_sq: result = "S_dollar_paren_sq"; break; - case S_dollar_sq: result = "S_dollar_sq"; break; - case S_dquoted: result = "S_dquoted"; break; - case S_end: result = "S_end"; break; - case S_entity: result = "S_entity"; break; - case S_equals: result = "S_equals"; break; - case S_ero: result = "S_ero"; break; - case S_esc: result = "S_esc"; break; - case S_esc_dq: result = "S_esc_dq"; break; - case S_esc_sq: result = "S_esc_sq"; break; - case S_exclamation: result = "S_exclamation"; break; - case S_in_kanji: result = "S_in_kanji"; break; - case S_incro: result = "S_incro"; break; - case S_pi: result = "S_pi"; break; - case S_junk_tag: result = "S_junk_tag"; break; - case S_litteral: result = "S_litteral"; break; - case S_marked: result = "S_marked"; break; - case S_nonascii_text: result = "S_nonascii_text"; break; - case S_nonascii_text_dq: result = "S_nonascii_text_dq"; break; - case S_nonascii_text_sq: result = "S_nonascii_text_sq"; break; - case S_paren: result = "S_paren"; break; - case S_paren_dq: result = "S_paren_dq"; break; - case S_paren_sq: result = "S_paren_sq"; break; - case S_pcdata: result = "S_pcdata"; break; - case S_script: result = "S_script"; break; - case S_sgmlatt: result = "S_sgmlatt"; break; - case S_sgmlele: result = "S_sgmlele"; break; - case S_sgmlent: result = "S_sgmlent"; break; - case S_squoted: result = "S_squoted"; break; - case S_tag: result = "S_tag"; break; - case S_tag_gap: result = "S_tag_gap"; break; - case S_tagname_slash: result = "S_tagname_slash"; break; - case S_text: result = "S_text"; break; - case S_value: result = "S_value"; break; - } - /* *INDENT-ON* */ - - return result; -} -#endif - -/* storage for Element Stack */ -#define DEPTH 10 -static HTElement pool[DEPTH]; -static int depth = 0; - -static HTElement *pool_alloc(void) -{ - depth++; - if (depth > DEPTH) - return (HTElement *) malloc(sizeof(HTElement)); - return (pool + depth - 1); -} - -static void pool_free(HTElement * e) -{ - if (depth > DEPTH) - FREE(e); - depth--; - return; -} - -#ifdef USE_PRETTYSRC - -static void HTMLSRC_apply_markup(HTStream *context, - HTlexeme lexeme, - BOOL start) -{ - HT_tagspec *ts = *((start ? lexeme_start : lexeme_end) + lexeme); - - while (ts) { -#ifdef USE_COLOR_STYLE - if (ts->start) { - current_tag_style = ts->style; - force_current_tag_style = TRUE; - forced_classname = ts->class_name; - force_classname = TRUE; - } -#endif - CTRACE((tfp, ts->start ? "SRCSTART %d\n" : "SRCSTOP %d\n", (int) lexeme)); - if (ts->start) - (*context->actions->start_element) (context->target, - ts->element, - ts->present, - (const char **) ts->value, - context->current_tag_charset, - &context->include); - else - (*context->actions->end_element) (context->target, - ts->element, - &context->include); - ts = ts->next; - } -} - -#define PSRCSTART(x) HTMLSRC_apply_markup(context,HTL_##x,START) -#define PSRCSTOP(x) HTMLSRC_apply_markup(context,HTL_##x,STOP) - -#define attr_is_href context->cur_attr_is_href -#define attr_is_name context->cur_attr_is_name -#endif - -static void set_chartrans_handling(HTStream *context, - HTParentAnchor *anchor, - int chndl) -{ - if (chndl < 0) { - /* - * Nothing was set for the parser in earlier stages, so the HTML - * parser's UCLYhndl should still be its default. - FM - */ - chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_STRUCTURED); - if (chndl < 0) - /* - * That wasn't set either, so seek the HText default. - FM - */ - chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT); - if (chndl < 0) - /* - * That wasn't set either, so assume the current display character - * set. - FM - */ - chndl = current_char_set; - /* - * Try to set the HText and HTML stages' chartrans info with the - * default lock level (will not be changed if it was set previously - * with a higher lock level). - FM - */ - HTAnchor_setUCInfoStage(anchor, chndl, - UCT_STAGE_HTEXT, - UCT_SETBY_DEFAULT); - HTAnchor_setUCInfoStage(anchor, chndl, - UCT_STAGE_STRUCTURED, - UCT_SETBY_DEFAULT); - /* - * Get the chartrans info for output to the HTML parser. - FM - */ - context->outUCI = HTAnchor_getUCInfoStage(anchor, - UCT_STAGE_STRUCTURED); - context->outUCLYhndl = HTAnchor_getUCLYhndl(context->node_anchor, - UCT_STAGE_STRUCTURED); - } - /* - * Set the in->out transformation parameters. - FM - */ - UCSetTransParams(&context->T, - context->inUCLYhndl, context->inUCI, - context->outUCLYhndl, context->outUCI); - /* - * This is intended for passing the SGML parser's input charset as an - * argument in each call to the HTML parser's start tag function, but it - * would be better to call a Lynx_HTML_parser function to set an element in - * its HTStructured object, itself, if this were needed. - FM - */ -#ifndef EXP_JAPANESEUTF8_SUPPORT - if (IS_CJK_TTY) { - context->current_tag_charset = -1; - } else -#endif - if (context->T.transp) { - context->current_tag_charset = context->inUCLYhndl; - } else if (context->T.decode_utf8) { - context->current_tag_charset = context->inUCLYhndl; - } else if (context->T.do_8bitraw || - context->T.use_raw_char_in) { - context->current_tag_charset = context->inUCLYhndl; - } else if (context->T.output_utf8 || - context->T.trans_from_uni) { - context->current_tag_charset = UCGetLYhndl_byMIME("utf-8"); - } else { - context->current_tag_charset = LATIN1; - } -} - -static void change_chartrans_handling(HTStream *context) -{ - int new_LYhndl = HTAnchor_getUCLYhndl(context->node_anchor, - UCT_STAGE_PARSER); - - if (new_LYhndl != context->inUCLYhndl && - new_LYhndl >= 0) { - /* - * Something changed. but ignore if a META wants an unknown charset. - */ - LYUCcharset *new_UCI = HTAnchor_getUCInfoStage(context->node_anchor, - UCT_STAGE_PARSER); - - if (new_UCI) { - LYUCcharset *next_UCI = HTAnchor_getUCInfoStage(context->node_anchor, - UCT_STAGE_STRUCTURED); - int next_LYhndl = HTAnchor_getUCLYhndl(context->node_anchor, UCT_STAGE_STRUCTURED); - - context->inUCI = new_UCI; - context->inUCLYhndl = new_LYhndl; - context->outUCI = next_UCI; - context->outUCLYhndl = next_LYhndl; - set_chartrans_handling(context, - context->node_anchor, next_LYhndl); - } - } -} - -#ifdef USE_COLOR_STYLE -#include <AttrList.h> -static int current_is_class = 0; -#endif - -/* Handle Attribute - * ---------------- - */ -/* PUBLIC const char * SGML_default = ""; ?? */ - -static void handle_attribute_name(HTStream *context, const char *s) -{ - HTTag *tag = context->current_tag; - const attr *attributes = tag->attributes; - int high, low, i, diff; - -#ifdef USE_PRETTYSRC - if (psrc_view) { - attr_is_href = FALSE; - attr_is_name = FALSE; - } -#endif - /* - * Ignore unknown tag. - KW - */ - if (tag == context->unknown_tag) { -#ifdef USE_PRETTYSRC - if (psrc_view) - context->current_attribute_number = 1; /* anything !=INVALID */ -#endif - return; - } - - /* - * Binary search for attribute name. - */ - for (low = 0, high = tag->number_of_attributes; - high > low; - diff < 0 ? (low = i + 1) : (high = i)) { - i = (low + (high - low) / 2); - diff = my_casecomp(attributes[i].name, s); - if (diff == 0) { /* success: found it */ - context->current_attribute_number = i; -#ifdef USE_PRETTYSRC - if (psrc_view) { - attr_is_name = (BOOL) (attributes[i].type == HTMLA_ANAME); - attr_is_href = (BOOL) (attributes[i].type == HTMLA_HREF); - } else -#endif - { - context->present[i] = YES; - Clear_extra(context->value[i]); -#ifdef USE_COLOR_STYLE -# ifdef USE_PRETTYSRC - current_is_class = IS_C(attributes[i]); -# else - current_is_class = (!strcasecomp("class", s)); -# endif - CTRACE((tfp, "SGML: found attribute %s, %d\n", s, current_is_class)); -#endif - } - return; - } - /* if */ - } /* for */ - - CTRACE((tfp, "SGML: Unknown attribute %s for tag %s\n", - s, NonNull(context->current_tag->name))); - context->current_attribute_number = INVALID; /* Invalid */ -} - -/* Handle attribute value - * ---------------------- - */ -static void handle_attribute_value(HTStream *context, const char *s) -{ - if (context->current_attribute_number != INVALID) { - StrAllocCopy_extra(context->value[context->current_attribute_number], s); -#ifdef USE_COLOR_STYLE - if (current_is_class) { - strncpy(class_string, s, TEMPSTRINGSIZE); - CTRACE((tfp, "SGML: class is '%s'\n", s)); - } else { - CTRACE((tfp, "SGML: attribute value is '%s'\n", s)); - } -#endif - } else { - CTRACE((tfp, "SGML: Attribute value %s ***ignored\n", s)); - } - context->current_attribute_number = INVALID; /* can't have two assignments! */ -} - -/* - * Translate some Unicodes to Lynx special codes and output them. - * Special codes - ones those output depend on parsing. - * - * Additional issue, like handling bidirectional text if necessary - * may be called from here: zwnj (8204), zwj (8205), lrm (8206), rlm (8207) - * - currently they are ignored in SGML.c and LYCharUtils.c - * but also in UCdomap.c because they are non printable... - * - */ -static BOOL put_special_unicodes(HTStream *context, UCode_t code) -{ - /* (Tgf_nolyspcl) */ - if (context->no_lynx_specialcodes) { - /* - * We were asked by a "DTD" flag to not generate lynx specials. - kw - */ - return NO; - } - - if (code == CH_NBSP) { /* S/390 -- gil -- 0657 */ - /* - * Use Lynx special character for nbsp. - */ -#ifdef USE_PRETTYSRC - if (!psrc_view) -#endif - PUTC(HT_NON_BREAK_SPACE); - } else if (code == CH_SHY) { - /* - * Use Lynx special character for shy. - */ -#ifdef USE_PRETTYSRC - if (!psrc_view) -#endif - PUTC(LY_SOFT_HYPHEN); - } else if (code == 8194 || code == 8201) { - /* - * Use Lynx special character for ensp or thinsp. - * - * Originally, Lynx use space '32' as word delimiter and omits this - * space at end of line if word is wrapped to the next line. There are - * several other spaces in the Unicode repertoire and we should teach - * Lynx to understand them, not only as regular characters but in the - * context of line wrapping. Unfortunately, if we use HT_EN_SPACE we - * override the chartrans tables for those spaces with a single '32' - * for all (but do line wrapping more fancy). - * - * We may treat emsp as one or two ensp (below). - */ -#ifdef USE_PRETTYSRC - if (!psrc_view) -#endif - PUTC(HT_EN_SPACE); - } else if (code == 8195) { - /* - * Use Lynx special character for emsp. - */ -#ifdef USE_PRETTYSRC - if (!psrc_view) { -#endif - /* PUTC(HT_EN_SPACE); let's stay with a single space :) */ - PUTC(HT_EN_SPACE); -#ifdef USE_PRETTYSRC - } -#endif - } else { - /* - * Return NO if nothing done. - */ - return NO; - } - /* - * We have handled it. - */ - return YES; -} - -#ifdef USE_PRETTYSRC -static void put_pretty_entity(HTStream *context, int term) -{ - PSRCSTART(entity); - PUTC('&'); - PUTS(entity_string); - if (term) - PUTC((char) term); - PSRCSTOP(entity); -} - -static void put_pretty_number(HTStream *context) -{ - PSRCSTART(entity); - PUTS((context->isHex ? "&#x" : "&#")); - PUTS(entity_string); - PUTC(';'); - PSRCSTOP(entity); -} -#endif /* USE_PRETTYSRC */ - -/* Handle entity - * ------------- - * - * On entry, - * s contains the entity name zero terminated - * Bugs: - * If the entity name is unknown, the terminator is treated as - * a printable non-special character in all cases, even if it is '<' - * Bug-fix: - * Modified SGML_character() so we only come here with terminator - * as '\0' and check a FoundEntity flag. -- Foteos Macrides - * - * Modified more (for use with Lynx character translation code): - */ -static char replace_buf[64]; /* buffer for replacement strings */ -static BOOL FoundEntity = FALSE; - -static void handle_entity(HTStream *context, char term) -{ - UCode_t code; - long uck = -1; - const char *s = context->string->data; - - /* - * Handle all entities normally. - FM - */ - FoundEntity = FALSE; - if ((code = HTMLGetEntityUCValue(s)) != 0) { - /* - * We got a Unicode value for the entity name. Check for special - * Unicodes. - FM - */ - if (put_special_unicodes(context, code)) { -#ifdef USE_PRETTYSRC - if (psrc_view) { - put_pretty_entity(context, term); - } -#endif - FoundEntity = TRUE; - return; - } - /* - * Seek a translation from the chartrans tables. - */ - if ((uck = UCTransUniChar(code, context->outUCLYhndl)) >= 32 && -/* =============== work in ASCII below here =============== S/390 -- gil -- 0672 */ - uck < 256 && - (uck < 127 || - uck >= LYlowest_eightbit[context->outUCLYhndl])) { -#ifdef USE_PRETTYSRC - if (psrc_view) { - put_pretty_entity(context, term); - } else -#endif - PUTC(FROMASCII((char) uck)); - FoundEntity = TRUE; - return; - } else if ((uck == -4 || - (context->T.repl_translated_C0 && - uck > 0 && uck < 32)) && - /* - * Not found; look for replacement string. - */ - (uck = UCTransUniCharStr(replace_buf, 60, code, - context->outUCLYhndl, 0) >= 0)) { -#ifdef USE_PRETTYSRC - if (psrc_view) { - put_pretty_entity(context, term); - } else -#endif - PUTS(replace_buf); - FoundEntity = TRUE; - return; - } - /* - * If we're displaying UTF-8, try that now. - FM - */ -#ifndef USE_PRETTYSRC - if (context->T.output_utf8 && PUTUTF8(code)) { - FoundEntity = TRUE; - return; - } -#else - if (context->T.output_utf8 && (psrc_view - ? (UCPutUtf8_charstring((HTStream *) context->target, - (putc_func_t *) (fake_put_character), - code)) - : PUTUTF8(code))) { - - if (psrc_view) { - put_pretty_entity(context, term); - } - - FoundEntity = TRUE; - return; - } -#endif - /* - * If it's safe ASCII, use it. - FM - */ - if (code >= 32 && code < 127) { -#ifdef USE_PRETTYSRC - if (psrc_view) { - put_pretty_entity(context, term); - } else -#endif - - PUTC(FROMASCII((char) code)); - FoundEntity = TRUE; - return; - } -/* =============== work in ASCII above here =============== S/390 -- gil -- 0682 */ - /* - * Ignore zwnj (8204) and zwj (8205), if we get to here. Note that - * zwnj may have been handled as <WBR> by the calling function. - FM - */ - if (!strcmp(s, "zwnj") || - !strcmp(s, "zwj")) { - CTRACE((tfp, "handle_entity: Ignoring '%s'.\n", s)); -#ifdef USE_PRETTYSRC - if (psrc_view) { - put_pretty_entity(context, term); - } -#endif - FoundEntity = TRUE; - return; - } - /* - * Ignore lrm (8206), and rln (8207), if we get to here. - FM - */ - if (!strcmp(s, "lrm") || - !strcmp(s, "rlm")) { - CTRACE((tfp, "handle_entity: Ignoring '%s'.\n", s)); -#ifdef USE_PRETTYSRC - if (psrc_view) { - put_pretty_entity(context, term); - } -#endif - FoundEntity = TRUE; - return; - } - } - - /* - * If entity string not found, display as text. - */ -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTART(badseq); -#endif - CTRACE((tfp, "SGML: Unknown entity '%s' %" PRI_UCode_t " %ld\n", s, code, uck)); /* S/390 -- gil -- 0695 */ - PUTC('&'); - PUTS(s); - if (term != '\0') - PUTC(term); -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTOP(badseq); -#endif -} - -/* Handle comment - * -------------- - */ -static void handle_comment(HTStream *context) -{ - const char *s = context->string->data; - - CTRACE((tfp, "SGML Comment:\n<%s>\n", s)); - - if (context->csi == NULL && - strncmp(s, "!--#", 4) == 0 && - LYCheckForCSI(context->node_anchor, &context->url) == TRUE) { - LYDoCSI(context->url, s, &context->csi); - } else { - LYCommentHacks(context->node_anchor, context->string->data); - } - - return; -} - -/* Handle identifier - * ----------------- - */ -static void handle_identifier(HTStream *context) -{ - const char *s = context->string->data; - - CTRACE((tfp, "SGML Identifier:\n<%s>\n", s)); - - return; -} - -/* Handle doctype - * -------------- - */ -static void handle_doctype(HTStream *context) -{ - const char *s = context->string->data; - - CTRACE((tfp, "SGML Doctype:\n<%s>\n", s)); - if (strstr(s, "DTD XHTML ") != 0) { - CTRACE((tfp, "...processing extended HTML\n")); - context->extended_html = TRUE; - } - - return; -} - -/* Handle marked - * ------------- - */ -static void handle_marked(HTStream *context) -{ - const char *s = context->string->data; - - CTRACE((tfp, "SGML Marked Section:\n<%s>\n", s)); - - if (!strncmp(context->string->data, "![INCLUDE[", 10)) { - context->string->data[context->string->size - 3] = '\0'; - StrAllocCat(context->include, context->string->data + 10); - /* @@@ This needs to take charset into account! @@@ - the wrong assumptions will be made about the data's - charset once it is in include - kw */ - - } else if (!strncmp(context->string->data, "![CDATA[", 8)) { - (*context->actions->put_block) (context->target, - context->string->data + 8, - context->string->size - 11); - - } - return; -} - -/* Handle processing instruction - * ----------------------------- - */ -static void handle_processing_instruction(HTStream *context) -{ - const char *s = context->string->data; - - CTRACE((tfp, "SGML Processing instruction:\n<%s>\n", s)); - - if (!strncmp(s, "?xml ", 5)) { - int flag = context->T.decode_utf8; - - context->strict_xml = TRUE; - /* - * Switch to UTF-8 if the encoding is explicitly "utf-8". - */ - if (!flag) { - char *t = strstr(s, "encoding="); - - if (t != 0) { - t += 9; - if (*t == '"') - ++t; - flag = !strncmp(t, "utf-8", 5); - } - if (flag) { - CTRACE((tfp, "...Use UTF-8 for XML\n")); - context->T.decode_utf8 = TRUE; - } - } - } - - return; -} - -/* Handle sgmlent - * -------------- - */ -static void handle_sgmlent(HTStream *context) -{ - const char *s = context->string->data; - - CTRACE((tfp, "SGML Entity Declaration:\n<%s>\n", s)); - - return; -} - -/* Handle sgmlent - * -------------- - */ -static void handle_sgmlele(HTStream *context) -{ - const char *s = context->string->data; - - CTRACE((tfp, "SGML Element Declaration:\n<%s>\n", s)); - - return; -} - -/* Handle sgmlatt - * -------------- - */ -static void handle_sgmlatt(HTStream *context) -{ - const char *s = context->string->data; - - CTRACE((tfp, "SGML Attribute Declaration:\n<%s>\n", s)); - - return; -} - -/* - * Convenience macros - tags (elements) are identified sometimes by an int or - * enum value ('TAGNUM'), sometimes by a pointer to HTTag ('TAGP'). - kw - */ -#define TAGNUM_OF_TAGP(t) (HTMLElement) (t - context->dtd->tags) -#define TAGP_OF_TAGNUM(e) (context->dtd->tags + e) - -/* - * The following implement special knowledge about OBJECT. As long as - * HTML_OBJECT is the only tag for which an alternative variant exist, they can - * be simple macros. - kw - */ -/* does 'TAGNUM' e have an alternative (variant) parsing mode? */ -#define HAS_ALT_TAGNUM(e) (e == HTML_OBJECT) - -/* return 'TAGNUM' of the alternative mode for 'TAGNUM' e, if any. */ -#define ALT_TAGNUM(e) ((e == HTML_OBJECT) ? HTML_ALT_OBJECT : e) - -/* return 'TAGNUM' of the normal mode for 'TAGNUM' e which may be alt. */ -#define NORMAL_TAGNUM(e) (((int)(e) >= HTML_ELEMENTS) ? HTML_OBJECT : (HTMLElement)e) - -/* More convenience stuff. - kw */ -#define ALT_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(ALT_TAGNUM(e)) -#define NORMAL_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(NORMAL_TAGNUM(e)) - -#define ALT_TAGP(t) ALT_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t)) -#define NORMAL_TAGP(t) NORMAL_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t)) - -static BOOL element_valid_within(HTTag * new_tag, HTTag * stacked_tag, BOOL direct) -{ - BOOL result = YES; - TagClass usecontains, usecontained; - - if (stacked_tag && new_tag) { - usecontains = (direct ? stacked_tag->contains : stacked_tag->icontains); - usecontained = (direct ? new_tag->contained : new_tag->icontained); - if (new_tag == stacked_tag) { - result = (BOOL) ((Tgc_same & usecontains) && - (Tgc_same & usecontained)); - } else { - result = (BOOL) ((new_tag->tagclass & usecontains) && - (stacked_tag->tagclass & usecontained)); - } - } - return result; -} - -typedef enum { - close_NO = 0, - close_error = 1, - close_valid = 2 -} canclose_t; - -static canclose_t can_close(HTTag * new_tag, HTTag * stacked_tag) -{ - canclose_t result; - - if (!stacked_tag) { - result = close_NO; - } else if (stacked_tag->flags & Tgf_endO) { - result = close_valid; - } else if (new_tag == stacked_tag) { - result = ((Tgc_same & new_tag->canclose) - ? close_error - : close_NO); - } else { - result = ((stacked_tag->tagclass & new_tag->canclose) - ? close_error - : close_NO); - } - return result; -} - -static void do_close_stacked(HTStream *context) -{ - HTElement *stacked = context->element_stack; - HTMLElement e; - - if (!stacked) - return; /* stack was empty */ - if (context->inSELECT && !strcasecomp(stacked->tag->name, "SELECT")) { - context->inSELECT = FALSE; - } - e = NORMAL_TAGNUM(TAGNUM_OF_TAGP(stacked->tag)); -#ifdef USE_PRETTYSRC - if (!psrc_view) /* Don't actually pass call on if viewing psrc - kw */ -#endif - (*context->actions->end_element) (context->target, - e, - &context->include); - context->element_stack = stacked->next; - pool_free(stacked); - context->no_lynx_specialcodes = - (BOOL) (context->element_stack - ? (context->element_stack->tag->flags & Tgf_nolyspcl) - : NO); -} - -static int is_on_stack(HTStream *context, HTTag * old_tag) -{ - HTElement *stacked = context->element_stack; - int i = 1; - - for (; stacked; stacked = stacked->next, i++) { - if (stacked->tag == old_tag || - stacked->tag == ALT_TAGP(old_tag)) - return i; - } - return 0; -} - -/* End element - * ----------- - */ -static void end_element(HTStream *context, HTTag * old_tag) -{ - BOOL extra_action_taken = NO; - canclose_t canclose_check = close_valid; - int stackpos = is_on_stack(context, old_tag); - - if (!Old_DTD) { - while (canclose_check != close_NO && - context->element_stack && - (stackpos > 1 || (!extra_action_taken && stackpos == 0))) { - if (stackpos == 0 && (old_tag->flags & Tgf_startO) && - element_valid_within(old_tag, context->element_stack->tag, YES)) { - CTRACE((tfp, "SGML: </%s> ignored\n", old_tag->name)); - return; - } - canclose_check = can_close(old_tag, context->element_stack->tag); - if (canclose_check != close_NO) { - CTRACE((tfp, "SGML: End </%s> \t<- %s end </%s>\n", - context->element_stack->tag->name, - ((canclose_check == close_valid) - ? "supplied," - : "***forced by"), - old_tag->name)); - do_close_stacked(context); - extra_action_taken = YES; - stackpos = is_on_stack(context, old_tag); - } - } - - if (stackpos == 0 && old_tag->contents != SGML_EMPTY) { - CTRACE((tfp, "SGML: Still open %s, ***no open %s for </%s>\n", - context->element_stack ? - context->element_stack->tag->name : "none", - old_tag->name, - old_tag->name)); - return; - } - if (stackpos > 1) { - CTRACE((tfp, - "SGML: Nesting <%s>...<%s> \t<- ***invalid end </%s>\n", - old_tag->name, - context->element_stack->tag->name, - old_tag->name)); - return; - } - } - /* Now let the non-extended code deal with the rest. - kw */ - - /* - * If we are in a SELECT block, ignore anything but a SELECT end tag. - FM - */ - if (context->inSELECT) { - if (!strcasecomp(old_tag->name, "SELECT")) { - /* - * Turn off the inSELECT flag and fall through. - FM - */ - context->inSELECT = FALSE; - } else { - /* - * Ignore the end tag. - FM - */ - CTRACE((tfp, "SGML: ***Ignoring end tag </%s> in SELECT block.\n", - old_tag->name)); - return; - } - } - /* - * Handle the end tag. - FM - */ - CTRACE((tfp, "SGML: End </%s>\n", old_tag->name)); - if (old_tag->contents == SGML_EMPTY) { - CTRACE((tfp, "SGML: ***Illegal end tag </%s> found.\n", - old_tag->name)); - return; - } -#ifdef WIND_DOWN_STACK - while (context->element_stack) /* Loop is error path only */ -#else - if (context->element_stack) /* Substitute and remove one stack element */ -#endif /* WIND_DOWN_STACK */ - { - int status = HT_OK; - HTMLElement e; - HTElement *N = context->element_stack; - HTTag *t = (N->tag != old_tag) ? NORMAL_TAGP(N->tag) : N->tag; - - if (old_tag != t) { /* Mismatch: syntax error */ - if (context->element_stack->next) { /* This is not the last level */ - CTRACE((tfp, - "SGML: Found </%s> when expecting </%s>. </%s> ***assumed.\n", - old_tag->name, t->name, t->name)); - } else { /* last level */ - CTRACE((tfp, - "SGML: Found </%s> when expecting </%s>. </%s> ***Ignored.\n", - old_tag->name, t->name, old_tag->name)); - return; /* Ignore */ - } - } - - e = NORMAL_TAGNUM(TAGNUM_OF_TAGP(t)); - CTRACE2(TRACE_SGML, (tfp, "tagnum(%p) = %d\n", (void *) t, (int) e)); -#ifdef USE_PRETTYSRC - if (!psrc_view) /* Don't actually pass call on if viewing psrc - kw */ -#endif - status = (*context->actions->end_element) (context->target, - e, &context->include); - if (status == HT_PARSER_REOPEN_ELT) { - CTRACE((tfp, "SGML: Restart <%s>\n", t->name)); - (*context->actions->start_element) (context->target, - e, - NULL, - NULL, - context->current_tag_charset, - &context->include); - } else if (status == HT_PARSER_OTHER_CONTENT) { - CTRACE((tfp, "SGML: Continue with other content model for <%s>\n", t->name)); - context->element_stack->tag = ALT_TAGP_OF_TAGNUM(e); - } else { - context->element_stack = N->next; /* Remove from stack */ - pool_free(N); - } - context->no_lynx_specialcodes = - (BOOL) (context->element_stack - ? (context->element_stack->tag->flags & Tgf_nolyspcl) - : NO); -#ifdef WIND_DOWN_STACK - if (old_tag == t) - return; /* Correct sequence */ -#else - return; -#endif /* WIND_DOWN_STACK */ - - /* Syntax error path only */ - - } - CTRACE((tfp, "SGML: Extra end tag </%s> found and ignored.\n", - old_tag->name)); -} - -/* Start a element -*/ -static void start_element(HTStream *context) -{ - int status; - HTTag *new_tag = context->current_tag; - HTMLElement e = TAGNUM_OF_TAGP(new_tag); - BOOL ok = FALSE; - - BOOL valid = YES; - BOOL direct_container = YES; - BOOL extra_action_taken = NO; - canclose_t canclose_check = close_valid; - - if (!Old_DTD) { - while (context->element_stack && - (canclose_check == close_valid || - (canclose_check == close_error && - new_tag == context->element_stack->tag)) && - !(valid = element_valid_within(new_tag, - context->element_stack->tag, - direct_container))) { - canclose_check = can_close(new_tag, context->element_stack->tag); - if (canclose_check != close_NO) { - CTRACE((tfp, "SGML: End </%s> \t<- %s start <%s>\n", - context->element_stack->tag->name, - ((canclose_check == close_valid) - ? "supplied," - : "***forced by"), - new_tag->name)); - do_close_stacked(context); - extra_action_taken = YES; - if (canclose_check == close_error) - direct_container = NO; - } else { - CTRACE((tfp, - "SGML: Still open %s \t<- ***invalid start <%s>\n", - context->element_stack->tag->name, - new_tag->name)); - } - } - if (context->element_stack && !valid && - (context->element_stack->tag->flags & Tgf_strict) && - !(valid = element_valid_within(new_tag, - context->element_stack->tag, - direct_container))) { - CTRACE((tfp, "SGML: Still open %s \t<- ***ignoring start <%s>\n", - context->element_stack->tag->name, - new_tag->name)); - return; - } - - if (context->element_stack && - !extra_action_taken && - (canclose_check == close_NO) && - !valid && (new_tag->flags & Tgf_mafse)) { - BOOL has_attributes = NO; - int i = 0; - - for (; i < new_tag->number_of_attributes && !has_attributes; i++) - has_attributes = context->present[i]; - if (!has_attributes) { - CTRACE((tfp, - "SGML: Still open %s, ***converting invalid <%s> to </%s>\n", - context->element_stack->tag->name, - new_tag->name, - new_tag->name)); - end_element(context, new_tag); - return; - } - } - - if (context->element_stack && - (canclose_check == close_error) && - !(valid = element_valid_within(new_tag, - context->element_stack->tag, - direct_container))) { - CTRACE((tfp, "SGML: Still open %s \t<- ***invalid start <%s>\n", - context->element_stack->tag->name, - new_tag->name)); - } - } - /* Fall through to the non-extended code - kw */ - - /* - * If we are not in a SELECT block, check if this is a SELECT start tag. - * Otherwise (i.e., we are in a SELECT block) accept only OPTION as valid, - * terminate the SELECT block if it is any other form-related element, and - * otherwise ignore it. - FM - */ - if (!context->inSELECT) { - /* - * We are not in a SELECT block, so check if this starts one. - FM - * (frequent case!) - */ - /* my_casecomp() - optimized by the first character */ - if (!my_casecomp(new_tag->name, "SELECT")) { - /* - * Set the inSELECT flag and fall through. - FM - */ - context->inSELECT = TRUE; - } - } else { - /* - * We are in a SELECT block. - FM - */ - if (strcasecomp(new_tag->name, "OPTION")) { - /* - * Ugh, it is not an OPTION. - FM - */ - switch (e) { - case HTML_INPUT: - case HTML_TEXTAREA: - case HTML_SELECT: - case HTML_BUTTON: - case HTML_FIELDSET: - case HTML_LABEL: - case HTML_LEGEND: - case HTML_FORM: - ok = TRUE; - break; - default: - break; - } - if (ok) { - /* - * It is another form-related start tag, so terminate the - * current SELECT block and fall through. - FM - */ - CTRACE((tfp, - "SGML: ***Faking SELECT end tag before <%s> start tag.\n", - new_tag->name)); - end_element(context, SGMLFindTag(context->dtd, "SELECT")); - } else { - /* - * Ignore the start tag. - FM - */ - CTRACE((tfp, - "SGML: ***Ignoring start tag <%s> in SELECT block.\n", - new_tag->name)); - return; - } - } - } - /* - * Handle the start tag. - FM - */ - CTRACE((tfp, "SGML: Start <%s>\n", new_tag->name)); - status = (*context->actions->start_element) (context->target, - TAGNUM_OF_TAGP(new_tag), - context->present, - (const char **) context->value, /* coerce type for think c */ - context->current_tag_charset, - &context->include); - if (status == HT_PARSER_OTHER_CONTENT) - new_tag = ALT_TAGP(new_tag); /* this is only returned for OBJECT */ - if (new_tag->contents != SGML_EMPTY) { /* i.e., tag not empty */ - HTElement *N = pool_alloc(); - - if (N == NULL) - outofmem(__FILE__, "start_element"); - N->next = context->element_stack; - N->tag = new_tag; - context->element_stack = N; - context->no_lynx_specialcodes = (BOOLEAN) (new_tag->flags & Tgf_nolyspcl); - - } else if (e == HTML_META) { - /* - * Check for result of META tag. - KW & FM - */ - change_chartrans_handling(context); - } -} - -/* Find Tag in DTD tag list - * ------------------------ - * - * On entry, - * dtd points to dtd structure including valid tag list - * string points to name of tag in question - * - * On exit, - * returns: - * NULL tag not found - * else address of tag structure in dtd - */ -HTTag *SGMLFindTag(const SGML_dtd * dtd, - const char *s) -{ - int high, low, i, diff; - static HTTag *last[64] = - {NULL}; /*optimize using the previous results */ - HTTag **res = last + (UCH(*s) % 64); /*pointer arithmetic */ - - if (*res) { - if ((*res)->name == NULL) - return NULL; - if (!strcasecomp((*res)->name, s)) - return *res; - } - - for (low = 0, high = dtd->number_of_tags; - high > low; - diff < 0 ? (low = i + 1) : (high = i)) { /* Binary search */ - i = (low + (high - low) / 2); - /* my_casecomp() - optimized by the first character, NOT_ASCII ok */ - diff = my_casecomp(dtd->tags[i].name, s); /* Case insensitive */ - if (diff == 0) { /* success: found it */ - *res = &dtd->tags[i]; - return *res; - } - } - if (IsNmStart(*s)) { - /* - * Unrecognized, but may be valid. - KW - */ - return &HTTag_unrecognized; - } - return NULL; -} - -/*________________________________________________________________________ - * Public Methods - */ - -/* Could check that we are back to bottom of stack! @@ */ -/* Do check! - FM */ -/* */ -static void SGML_free(HTStream *context) -{ - int i; - HTElement *cur; - HTTag *t; - - /* - * Free the buffers. - FM - */ - FREE(context->recover); - FREE(context->url); - FREE(context->csi); - FREE(context->include); - FREE(context->active_include); - - /* - * Wind down stack if any elements are open. - FM - */ - while (context->element_stack) { - cur = context->element_stack; - t = cur->tag; - context->element_stack = cur->next; /* Remove from stack */ - pool_free(cur); -#ifdef USE_PRETTYSRC - if (!psrc_view) /* Don't actually call on target if viewing psrc - kw */ -#endif - (*context->actions->end_element) (context->target, - NORMAL_TAGNUM(TAGNUM_OF_TAGP(t)), - &context->include); - FREE(context->include); - } - - /* - * Finish off the target. - FM - */ - (*context->actions->_free) (context->target); - - /* - * Free the strings and context structure. - FM - */ - HTChunkFree(context->string); - for (i = 0; i < MAX_ATTRIBUTES; i++) - FREE_extra(context->value[i]); - FREE(context); - -#ifdef USE_PRETTYSRC - sgml_in_psrc_was_initialized = FALSE; -#endif -} - -static void SGML_abort(HTStream *context, HTError e) -{ - int i; - HTElement *cur; - - /* - * Abort the target. - FM - */ - (*context->actions->_abort) (context->target, e); - - /* - * Free the buffers. - FM - */ - FREE(context->recover); - FREE(context->include); - FREE(context->active_include); - FREE(context->url); - FREE(context->csi); - - /* - * Free stack memory if any elements were left open. - KW - */ - while (context->element_stack) { - cur = context->element_stack; - context->element_stack = cur->next; /* Remove from stack */ - pool_free(cur); - } - - /* - * Free the strings and context structure. - FM - */ - HTChunkFree(context->string); - for (i = 0; i < MAX_ATTRIBUTES; i++) - FREE_extra(context->value[i]); - FREE(context); - -#ifdef USE_PRETTYSRC - sgml_in_psrc_was_initialized = FALSE; -#endif -} - -/* Read and write user callback handle - * ----------------------------------- - * - * The callbacks from the SGML parser have an SGML context parameter. - * These calls allow the caller to associate his own context with a - * particular SGML context. - */ - -#ifdef CALLERDATA -void *SGML_callerData(HTStream *context) -{ - return context->callerData; -} - -void SGML_setCallerData(HTStream *context, void *data) -{ - context->callerData = data; -} -#endif /* CALLERDATA */ - -#ifdef USE_PRETTYSRC -static void transform_tag(HTStream *context, HTChunk *string) -{ - if (!context->strict_xml) { - if (tagname_transform != 1) { - if (tagname_transform == 0) - LYLowerCase(string->data); - else - LYUpperCase(string->data); - } - } -} -#endif /* USE_PRETTYSRC */ - -static BOOL ignore_when_empty(HTTag * tag) -{ - BOOL result = FALSE; - - if (!LYPreparsedSource - && LYxhtml_parsing - && tag->name != 0 - && !(tag->flags & Tgf_mafse) - && tag->contents != SGML_EMPTY - && tag->tagclass != Tgc_Plike - && (tag->tagclass == Tgc_SELECTlike - || (tag->contains && tag->icontains))) { - result = TRUE; - } - CTRACE((tfp, "SGML Do%s ignore_when_empty:%s\n", - result ? "" : " not", - NonNull(tag->name))); - return result; -} - -static void discard_empty(HTStream *context) -{ - static HTTag empty_tag; - - CTRACE((tfp, "SGML discarding empty %s\n", - NonNull(context->current_tag->name))); - CTRACE_FLUSH(tfp); - - memset(&empty_tag, 0, sizeof(empty_tag)); - context->current_tag = &empty_tag; - context->string->size = 0; - - /* do not call end_element() if start_element() was not called */ -} - -#ifdef USE_PRETTYSRC -static BOOL end_if_prettysrc(HTStream *context, HTChunk *string, char end_ch) -{ - BOOL result = psrc_view; - - if (psrc_view) { - if (attr_is_name) { - HTStartAnchor(context->target, string->data, NULL); - (*context->actions->end_element) (context->target, - HTML_A, - &context->include); - } else if (attr_is_href) { - PSRCSTART(href); - HTStartAnchor(context->target, NULL, string->data); - } - PUTS_TR(string->data); - if (attr_is_href) { - (*context->actions->end_element) (context->target, - HTML_A, - &context->include); - PSRCSTOP(href); - } - if (end_ch) - PUTC(end_ch); - PSRCSTOP(attrval); - } - return result; -} -#endif - -static void SGML_character(HTStream *context, char c_in) -{ - const SGML_dtd *dtd = context->dtd; - HTChunk *string = context->string; - const char *EntityName; - HTTag *testtag = NULL; - BOOLEAN chk; /* Helps (?) walk through all the else ifs... */ - UCode_t clong, uck = 0; /* Enough bits for UCS4 ... */ - int testlast; - -#ifdef CJK_EX - unsigned char c; - -#else - char c; -#endif - char saved_char_in = '\0'; - - ++sgml_offset; - - /* - * Now some fun with the preprocessor. Use copies for c and unsign_c == - * clong, so that we can revert back to the unchanged c_in. - KW - */ -#define unsign_c clong - - c = c_in; - clong = UCH(c); /* a.k.a. unsign_c */ - - if (context->T.decode_utf8) { - /* - * Combine UTF-8 into Unicode. Incomplete characters silently ignored. - * From Linux kernel's console.c. - KW - */ - if (TOASCII(UCH(c)) > 127) { /* S/390 -- gil -- 0710 */ - /* - * We have an octet from a multibyte character. - FM - */ - if (context->utf_count > 0 && (TOASCII(c) & 0xc0) == 0x80) { - context->utf_char = (context->utf_char << 6) | (TOASCII(c) & 0x3f); - context->utf_count--; - *(context->utf_buf_p) = c; - (context->utf_buf_p)++; - if (context->utf_count == 0) { - /* - * We have all of the bytes, so terminate the buffer and - * set 'clong' to the UCode_t value. - FM - */ - *(context->utf_buf_p) = '\0'; - clong = context->utf_char; - if (clong < 256) { - c = ((char) (clong & 0xff)); - } - /* lynx does not use left-to-right */ - if (clong == 0x200e) - return; - goto top1; - } else { - /* - * Wait for more. - KW - */ - return; - } - } else { - /* - * Start handling a new multibyte character. - FM - */ - context->utf_buf_p = context->utf_buf; - *(context->utf_buf_p) = c; - (context->utf_buf_p)++; - if ((c & 0xe0) == 0xc0) { - context->utf_count = 1; - context->utf_char = (c & 0x1f); - } else if ((c & 0xf0) == 0xe0) { - context->utf_count = 2; - context->utf_char = (c & 0x0f); - } else if ((c & 0xf8) == 0xf0) { - context->utf_count = 3; - context->utf_char = (c & 0x07); - } else if ((c & 0xfc) == 0xf8) { - context->utf_count = 4; - context->utf_char = (c & 0x03); - } else if ((c & 0xfe) == 0xfc) { - context->utf_count = 5; - context->utf_char = (c & 0x01); - } else { - /* - * Garbage. - KW - */ - context->utf_count = 0; - context->utf_buf_p = context->utf_buf; - *(context->utf_buf_p) = '\0'; - } - /* - * Wait for more. - KW - */ - return; - } - } else { - /* - * Got an ASCII char. - KW - */ - context->utf_count = 0; - context->utf_buf_p = context->utf_buf; - *(context->utf_buf_p) = '\0'; - /* goto top; */ - } - } - /* end of context->T.decode_utf8 S/390 -- gil -- 0726 */ -#ifdef NOTDEFINED - /* - * If we have a koi8-r input and do not have koi8-r as the output, save the - * raw input in saved_char_in before we potentially convert it to Unicode. - * - FM - */ - if (context->T.strip_raw_char_in) - saved_char_in = c; -#endif /* NOTDEFINED */ - - /* - * If we want the raw input converted to Unicode, try that now. - FM - */ - if (context->T.trans_to_uni && -#ifdef EXP_JAPANESEUTF8_SUPPORT - ((strcmp(LYCharSet_UC[context->inUCLYhndl].MIMEname, "euc-jp") == 0) || - (strcmp(LYCharSet_UC[context->inUCLYhndl].MIMEname, "shift_jis") == 0))) { - if (strcmp(LYCharSet_UC[context->inUCLYhndl].MIMEname, "shift_jis") == 0) { - if (context->utf_count == 0) { - if (IS_SJIS_HI1((unsigned char) c) || - IS_SJIS_HI2((unsigned char) c)) { - context->utf_buf[0] = c; - context->utf_count = 1; - clong = -11; - } - } else { - if (IS_SJIS_LO((unsigned char) c)) { - context->utf_buf[1] = c; - clong = UCTransJPToUni(context->utf_buf, 2, context->inUCLYhndl); - } - context->utf_count = 0; - } - } else { - if (context->utf_count == 0) { - if (IS_EUC_HI((unsigned char) c)) { - context->utf_buf[0] = c; - context->utf_count = 1; - clong = -11; - } - } else { - if (IS_EUC_LOX((unsigned char) c)) { - context->utf_buf[1] = c; - clong = UCTransJPToUni(context->utf_buf, 2, context->inUCLYhndl); - } - context->utf_count = 0; - } - } - goto top1; - } else if (context->T.trans_to_uni && -#endif - ((TOASCII(unsign_c) >= LYlowest_eightbit[context->inUCLYhndl]) || /* S/390 -- gil -- 0744 */ - (unsign_c < ' ' && unsign_c != 0 && - context->T.trans_C0_to_uni))) { - /* - * Convert the octet to Unicode. - FM - */ - clong = UCTransToUni(c, context->inUCLYhndl); - if (clong > 0) { - saved_char_in = c; - if (clong < 256) { - c = FROMASCII((char) clong); - } - } - goto top1; - } else if (unsign_c < ' ' && unsign_c != 0 && /* S/390 -- gil -- 0768 */ - context->T.trans_C0_to_uni) { - /* - * This else if may be too ugly to keep. - KW - */ - if (context->T.trans_from_uni && - (((clong = UCTransToUni(c, context->inUCLYhndl)) >= ' ') || - (context->T.transp && - (clong = UCTransToUni(c, context->inUCLYhndl)) > 0))) { - saved_char_in = c; - if (clong < 256) { - c = FROMASCII((char) clong); - } - goto top1; - } else { - uck = -1; - if (context->T.transp) { - uck = UCTransCharStr(replace_buf, 60, c, - context->inUCLYhndl, - context->inUCLYhndl, NO); - } - if (!context->T.transp || uck < 0) { - uck = UCTransCharStr(replace_buf, 60, c, - context->inUCLYhndl, - context->outUCLYhndl, YES); - } - if (uck == 0) { - return; - } else if (uck < 0) { - goto top0a; - } - c = replace_buf[0]; - if (c && replace_buf[1]) { - if (context->state == S_text) { - PUTS(replace_buf); - return; - } - StrAllocCat(context->recover, replace_buf + 1); - } - goto top0a; - } /* Next line end of ugly stuff for C0. - KW */ - } else { /* end of context->T.trans_to_uni S/390 -- gil -- 0791 */ - goto top0a; - } - - /* - * At this point we have either unsign_c a.k.a. clong in Unicode (and c in - * latin1 if clong is in the latin1 range), or unsign_c and c will have to - * be passed raw. - KW - */ -/* - * We jump up to here from below if we have - * stuff in the recover, insert, or csi buffers - * to process. We zero saved_char_in, in effect - * as a flag that the octet is not that of the - * actual call to this function. This may be OK - * for now, for the stuff this function adds to - * its recover buffer, but it might not be for - * stuff other functions added to the insert or - * csi buffer, so bear that in mind. - FM - * Stuff from the recover buffer is now handled - * as UTF-8 if we can expect that's what it is, - * and in that case we don't come back up here. - kw - */ - top: - saved_char_in = '\0'; -/* - * We jump to here from above when we don't have - * UTF-8 input, haven't converted to Unicode, and - * want clong set to the input octet (unsigned) - * without zeroing its saved_char_in copy (which - * is signed). - FM - */ - top0a: - *(context->utf_buf) = '\0'; - clong = UCH(c); -/* - * We jump to here from above if we have converted - * the input, or a multibyte sequence across calls, - * to a Unicode value and loaded it into clong (to - * which unsign_c has been defined), and from below - * when we are recycling a character (e.g., because - * it terminated an entity but is not the standard - * semi-colon). The character will already have - * been put through the Unicode conversions. - FM - */ - top1: - /* - * Ignore low ISO 646 7-bit control characters if HTCJK is not set. - FM - */ - /* - * Works for both ASCII and EBCDIC. -- gil - * S/390 -- gil -- 0811 - */ - if (TOASCII(unsign_c) < 32 && - c != '\t' && c != '\n' && c != '\r' && - !IS_CJK_TTY) - goto after_switch; - - /* - * Ignore 127 if we don't have HTPassHighCtrlRaw or HTCJK set. - FM - */ -#define PASSHICTRL (context->T.transp || \ - unsign_c >= LYlowest_eightbit[context->inUCLYhndl]) - if (TOASCII(c) == 127 && /* S/390 -- gil -- 0830 */ - !(PASSHICTRL || IS_CJK_TTY)) - goto after_switch; - - /* - * Ignore 8-bit control characters 128 - 159 if neither HTPassHighCtrlRaw - * nor HTCJK is set. - FM - */ - if (TOASCII(unsign_c) > 127 && TOASCII(unsign_c) < 160 && /* S/390 -- gil -- 0847 */ - !(PASSHICTRL || IS_CJK_TTY)) { - /* - * If we happen to be reading from an "ISO-8859-1" or "US-ASCII" - * document, allow the cp-1252 codes, to accommodate the HTML5 draft - * recommendation for replacement encoding: - * - * http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#character-encodings-0 - */ - if (context->inUCLYhndl == LATIN1 - || context->inUCLYhndl == US_ASCII) { - clong = LYcp1252ToUnicode(c); - goto top1; - } - goto after_switch; - } - - /* Almost all CJK characters are double byte but only Japanese - * JIS X0201 Kana is single byte. To prevent to fail SGML parsing - * we have to take care of them here. -- TH - */ - if ((HTCJK == JAPANESE) && (context->state == S_in_kanji) && - !IS_JAPANESE_2BYTE(context->kanji_buf, UCH(c)) -#ifdef EXP_JAPANESEUTF8_SUPPORT - && !context->T.decode_utf8 -#endif - ) { -#ifdef CONV_JISX0201KANA_JISX0208KANA - if (IS_SJIS_X0201KANA(context->kanji_buf)) { - unsigned char sjis_hi, sjis_lo; - - JISx0201TO0208_SJIS(context->kanji_buf, &sjis_hi, &sjis_lo); - PUTC(sjis_hi); - PUTC(sjis_lo); - } else -#endif - PUTC(context->kanji_buf); - context->state = S_text; - } - - /* - * Handle character based on context->state. - */ - CTRACE2(TRACE_SGML, (tfp, "SGML before %s|%.*s|%c|\n", - state_name(context->state), - string->size, - NonNull(string->data), - UCH(c))); - switch (context->state) { - - case S_in_kanji: - /* - * Note that if we don't have a CJK input, then this is not the second - * byte of a CJK di-byte, and we're trashing the input. That's why - * 8-bit characters followed by, for example, '<' can cause the tag to - * be treated as text, not markup. We could try to deal with it by - * holding each first byte and then checking byte pairs, but that - * doesn't seem worth the overhead (see below). - FM - */ - context->state = S_text; - PUTC(context->kanji_buf); - PUTC(c); - break; - - case S_tagname_slash: - /* - * We had something link "<name/" so far, set state to S_text but keep - * context->slashedtag as a flag; except if we get '>' directly - * after the "<name/", and really have a tag for that name in - * context->slashedtag, in which case keep state as is and let code - * below deal with it. - kw - */ - if (!(c == '>' && context->slashedtag && TOASCII(unsign_c) < 127)) { - context->state = S_text; - } - /* fall through in any case! */ - case S_text: - if (IS_CJK_TTY && ((TOASCII(c) & 0200) != 0) -#ifdef EXP_JAPANESEUTF8_SUPPORT - && !context->T.decode_utf8 -#endif - ) { /* S/390 -- gil -- 0864 */ - /* - * Setting up for Kanji multibyte handling (based on Takuya ASADA's - * (asada@three-a.co.jp) CJK Lynx). Note that if the input is not - * in fact CJK, the next byte also will be mishandled, as explained - * above. Toggle raw mode off in such cases, or select the "7 bit - * approximations" display character set, which is largely - * equivalent to having raw mode off with CJK. - FM - */ - context->state = S_in_kanji; - context->kanji_buf = c; - break; - } else if (IS_CJK_TTY && TOASCII(c) == '\033') { /* S/390 -- gil -- 0881 */ - /* - * Setting up for CJK escape sequence handling (based on Takuya - * ASADA's (asada@three-a.co.jp) CJK Lynx). - FM - */ - context->state = S_esc; - PUTC(c); - break; - } - - if (c == '&' || c == '<') { -#ifdef USE_PRETTYSRC - if (psrc_view) { /*there is nothing useful in the element_stack */ - testtag = context->current_tag; - } else -#endif - { - testtag = context->element_stack ? - context->element_stack->tag : NULL; - } - } - - if (c == '&' && TOASCII(unsign_c) < 127 && /* S/390 -- gil -- 0898 */ - (!testtag || - (testtag->contents == SGML_MIXED || - testtag->contents == SGML_ELEMENT || - testtag->contents == SGML_PCDATA || -#ifdef USE_PRETTYSRC - testtag->contents == SGML_EMPTY || -#endif - testtag->contents == SGML_RCDATA))) { - /* - * Setting up for possible entity, without the leading '&'. - FM - */ - string->size = 0; - context->state = S_ero; - } else if (c == '<' && TOASCII(unsign_c) < 127) { /* S/390 -- gil -- 0915 */ - /* - * Setting up for possible tag. - FM - */ - string->size = 0; - if (testtag && testtag->contents == SGML_PCDATA) { - context->state = S_pcdata; - } else if (testtag && (testtag->contents == SGML_LITTERAL - || testtag->contents == SGML_CDATA)) { - context->state = S_litteral; - } else if (testtag && (testtag->contents == SGML_SCRIPT)) { - context->state = S_script; - } else { - context->state = S_tag; - } - context->slashedtag = NULL; - } else if (context->slashedtag && - context->slashedtag->name && - (c == '/' || - (c == '>' && context->state == S_tagname_slash)) && - TOASCII(unsign_c) < 127) { - /* - * We got either the second slash of a pending "<NAME/blah blah/" - * shortref construct, or the '>' of a mere "<NAME/>". In both - * cases generate a "</NAME>" end tag in the recover buffer for - * reparsing unless NAME is really an empty element. - kw - */ -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(abracket); - PUTC(c); - PSRCSTOP(abracket); - } else -#endif - if (context->slashedtag != context->unknown_tag && - !ReallyEmptyTag(context->slashedtag)) { - if (context->recover == NULL) { - StrAllocCopy(context->recover, "</"); - context->recover_index = 0; - } else { - StrAllocCat(context->recover, "</"); - } - StrAllocCat(context->recover, context->slashedtag->name); - StrAllocCat(context->recover, ">"); - } - context->slashedtag = NULL; - - } else if (context->element_stack && - (context->element_stack->tag->flags & Tgf_frecyc)) { - /* - * The element stack says we are within the contents of an element - * that the next stage (HTML.c) may want to feed us back again (via - * the *include string). So try to output text in UTF-8 if - * possible, using the same logic as for attribute values (which - * should be in line with what context->current_tag_charset - * indicates). - kw - */ - if (context->T.decode_utf8 && - *context->utf_buf) { - PUTS(context->utf_buf); - context->utf_buf_p = context->utf_buf; - *(context->utf_buf_p) = '\0'; - } else if (!IS_CJK_TTY && - (context->T.output_utf8 || - context->T.trans_from_uni)) { - if (LYIsASCII(clong)) { - PUTC(c); - } else if (clong == 0xfffd && saved_char_in && - HTPassEightBitRaw && - UCH(saved_char_in) >= - LYlowest_eightbit[context->outUCLYhndl]) { - PUTUTF8((0xf000 | UCH(saved_char_in))); - } else { - PUTUTF8(clong); - } - } else if (saved_char_in && context->T.use_raw_char_in) { - PUTC(saved_char_in); - } else { - PUTC(c); - } - -#define PASS8859SPECL context->T.pass_160_173_raw - /* - * Convert 160 (nbsp) to Lynx special character if neither - * HTPassHighCtrlRaw nor HTCJK is set. - FM - */ - } else if (unsign_c == CH_NBSP && /* S/390 -- gil -- 0932 */ - !context->no_lynx_specialcodes && - !(PASS8859SPECL || IS_CJK_TTY)) { - PUTC(HT_NON_BREAK_SPACE); - /* - * Convert 173 (shy) to Lynx special character if neither - * HTPassHighCtrlRaw nor HTCJK is set. - FM - */ - } else if (unsign_c == CH_SHY && /* S/390 -- gil -- 0949 */ - !context->no_lynx_specialcodes && - !(PASS8859SPECL || IS_CJK_TTY)) { - PUTC(LY_SOFT_HYPHEN); - /* - * Handle the case in which we think we have a character which - * doesn't need further processing (e.g., a koi8-r input for a - * koi8-r output). - FM - */ - } else if (context->T.use_raw_char_in && saved_char_in) { - /* - * Only if the original character is still in saved_char_in, - * otherwise we may be iterating from a goto top. - KW - */ - PUTC(saved_char_in); - saved_char_in = '\0'; -/****************************************************************** - * I. LATIN-1 OR UCS2 TO DISPLAY CHARSET - ******************************************************************/ - } else if ((chk = (BOOL) (context->T.trans_from_uni && - TOASCII(unsign_c) >= 160)) && /* S/390 -- gil -- 0968 */ - (uck = UCTransUniChar(unsign_c, - context->outUCLYhndl)) >= ' ' && - uck < 256) { - CTRACE((tfp, "UCTransUniChar returned 0x%.2lX:'%c'.\n", - uck, FROMASCII((char) uck))); - /* - * We got one octet from the conversions, so use it. - FM - */ - PUTC(FROMASCII((char) uck)); - } else if ((chk && - (uck == -4 || - (context->T.repl_translated_C0 && - uck > 0 && uck < 32))) && - /* - * Not found; look for replacement string. - KW - */ - (uck = UCTransUniCharStr(replace_buf, 60, clong, - context->outUCLYhndl, - 0) >= 0)) { - /* - * Got a replacement string. No further tests for validity - - * assume that whoever defined replacement strings knew what she - * was doing. - KW - */ - PUTS(replace_buf); - /* - * If we're displaying UTF-8, try that now. - FM - */ - } else if (context->T.output_utf8 && PUTUTF8(clong)) { - ; /* do nothing more */ - /* - * If it's any other (> 160) 8-bit character, and we have not set - * HTPassEightBitRaw nor HTCJK, nor have the "ISO Latin 1" - * character set selected, back translate for our character set. - - * FM - */ -#define IncludesLatin1Enc \ - (context->outUCLYhndl == LATIN1 || \ - (context->outUCI && \ - (context->outUCI->enc & (UCT_CP_SUPERSETOF_LAT1)))) - -#define PASSHI8BIT (HTPassEightBitRaw || \ - (context->T.do_8bitraw && !context->T.trans_from_uni)) - - } else if (unsign_c > 160 && unsign_c < 256 && - !(PASSHI8BIT || IS_CJK_TTY) && - !IncludesLatin1Enc) { -#ifdef USE_PRETTYSRC - int psrc_view_backup = 0; -#endif - - string->size = 0; - EntityName = HTMLGetEntityName((int) (unsign_c - 160)); - HTChunkPuts(string, EntityName); - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - /* we need to disable it temporarily */ - if (psrc_view) { - psrc_view_backup = 1; - psrc_view = 0; - } -#endif - handle_entity(context, '\0'); -#ifdef USE_PRETTYSRC - /* we need to disable it temporarily */ - if (psrc_view_backup) - psrc_view = TRUE; -#endif - - string->size = 0; - if (!FoundEntity) - PUTC(';'); - /* - * If we get to here and have an ASCII char, pass the character. - - * KW - */ - } else if (TOASCII(unsign_c) < 127 && unsign_c > 0) { /* S/390 -- gil -- 0987 */ - PUTC(c); - /* - * If we get to here, and should have translated, translation has - * failed so far. - KW - * - * We should have sent UTF-8 output to the parser already, but what - * the heck, try again. - FM - */ - } else if (context->T.output_utf8 && *context->utf_buf) { - PUTS(context->utf_buf); - context->utf_buf_p = context->utf_buf; - *(context->utf_buf_p) = '\0'; -#ifdef NOTDEFINED - /* - * Check for a strippable koi8-r 8-bit character. - FM - */ - } else if (context->T.strip_raw_char_in && saved_char_in && - (UCH(saved_char_in) >= 0xc0) && - (UCH(saved_char_in) < 255)) { - /* - * KOI8 special: strip high bit, gives (somewhat) readable ASCII - * or KOI7 - it was constructed that way! - KW - */ - PUTC(((char) (saved_char_in & 0x7f))); - saved_char_in = '\0'; -#endif /* NOTDEFINED */ - /* - * If we don't actually want the character, make it safe and output - * that now. - FM - */ - } else if (TOASCII(UCH(c)) < /* S/390 -- gil -- 0997 */ - LYlowest_eightbit[context->outUCLYhndl] || - (context->T.trans_from_uni && !HTPassEightBitRaw)) { - /* - * If we get to here, pass the character. - FM - */ - } else { - PUTC(c); - } - break; - - /* - * Found '<' in SGML_PCDATA content; treat this mode nearly like - * S_litteral, but recognize '<!' and '<?' to filter out comments and - * processing instructions. - kw - */ - case S_pcdata: - if (!string->size && TOASCII(unsign_c) < 127) { /* first after '<' */ - if (c == '!') { /* <! */ - /* - * Terminate and set up for possible comment, identifier, - * declaration, or marked section as under S_tag. - kw - */ - context->state = S_exclamation; - context->lead_exclamation = TRUE; - context->doctype_bracket = FALSE; - context->first_bracket = FALSE; - HTChunkPutc(string, c); - break; - } else if (c == '?') { /* <? - ignore as a PI until '>' - kw */ - CTRACE((tfp, - "SGML: Found PI in PCDATA, junking it until '>'\n")); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(abracket); - PUTS("<?"); - PSRCSTOP(abracket); - } -#endif - context->state = S_pi; - break; - } - } - goto case_S_litteral; - - /* - * Found '<' in SGML_SCRIPT content; treat this mode nearly like - * S_litteral, but recognize '<!' to allow the content to be treated as - * a comment by lynx. - */ - case S_script: - if (!string->size && TOASCII(unsign_c) < 127) { /* first after '<' */ - if (c == '!') { /* <! */ - /* - * Terminate and set up for possible comment, identifier, - * declaration, or marked section as under S_tag. - kw - */ - context->state = S_exclamation; - context->lead_exclamation = TRUE; - context->doctype_bracket = FALSE; - context->first_bracket = FALSE; - HTChunkPutc(string, c); - break; - } - } - goto case_S_litteral; - - /* - * In litteral mode, waits only for specific end tag (for compatibility - * with old servers, and for Lynx). - FM - */ - case_S_litteral: - case S_litteral: - /*PSRC:this case not understood completely by HV, not done */ - HTChunkPutc(string, c); -#ifdef USE_PRETTYSRC - if (psrc_view) { - /* there is nothing useful in the element_stack */ - testtag = context->current_tag; - } else -#endif - testtag = (context->element_stack - ? context->element_stack->tag - : NULL); - - if (testtag == NULL || testtag->name == NULL) { - string->size--; - context->state = S_text; - goto top1; - } - - /* - * Normally when we get the closing ">", - * testtag contains something like "TITLE" - * string contains something like "/title>" - * so we decrement by 2 to compare the final character of each. - */ - testlast = string->size - 2 - context->trailing_spaces - context->leading_spaces; - - if (TOUPPER(c) != ((testlast < 0) - ? '/' - : testtag->name[testlast])) { - int i; - - /* - * If complete match, end litteral. - */ - if ((c == '>') && - testlast >= 0 && !testtag->name[testlast]) { -#ifdef USE_PRETTYSRC - if (psrc_view) { - char *trailing = NULL; - - if (context->trailing_spaces) { - StrAllocCopy(trailing, - string->data - + string->size - - 1 - - context->trailing_spaces); - trailing[context->trailing_spaces] = '\0'; - } - - PSRCSTART(abracket); - PUTS("</"); - PSRCSTOP(abracket); - PSRCSTART(tag); - - strcpy(string->data, context->current_tag->name); - transform_tag(context, string); - PUTS(string->data); - - if (trailing) { - PUTS(trailing); - FREE(trailing); - } - - PSRCSTOP(tag); - PSRCSTART(abracket); - PUTC('>'); - PSRCSTOP(abracket); - - context->current_tag = NULL; - } else -#endif - end_element(context, context->element_stack->tag); - - string->size = 0; - context->current_attribute_number = INVALID; - context->state = S_text; - context->leading_spaces = 0; - context->trailing_spaces = 0; - break; - } - - /* - * Allow whitespace between the "<" or ">" and the keyword, for - * error-recovery. - */ - if (isspace(UCH(c))) { - if (testlast == -1) { - context->leading_spaces += 1; - CTRACE2(TRACE_SGML, (tfp, "leading spaces: %d\n", context->leading_spaces)); - break; - } else if (testlast > 0) { - context->trailing_spaces += 1; - CTRACE2(TRACE_SGML, (tfp, "trailing spaces: %d\n", context->trailing_spaces)); - break; - } - } - - /* - * Mismatch - recover. - */ - context->leading_spaces = 0; - context->trailing_spaces = 0; - if (((testtag->contents != SGML_LITTERAL && - (testtag->flags & Tgf_strict)) || - (context->state == S_pcdata && - (testtag->flags & (Tgf_strict | Tgf_endO)))) && - (testlast > -1 && - (c == '>' || testlast > 0 || IsNmStart(c)))) { - context->state = S_end; - string->size--; - for (i = 0; i < string->size; i++) /* remove '/' */ - string->data[i] = string->data[i + 1]; - if ((string->size == 1) ? IsNmStart(c) : IsNmChar(c)) - break; - string->size--; - goto top1; - } - if (context->state == S_pcdata && - (testtag->flags & (Tgf_strict | Tgf_endO)) && - (testlast < 0 && IsNmStart(c))) { - context->state = S_tag; - break; - } - /* - * If Mismatch: recover string literally. - */ - PUTC('<'); - for (i = 0; i < string->size - 1; i++) /* recover, except last c */ - PUTC(string->data[i]); - string->size = 0; - context->state = S_text; - goto top1; /* to recover last c */ - } - break; - - /* - * Character reference (numeric entity) or named entity. - */ - case S_ero: - if (c == '#') { - /* - * Setting up for possible numeric entity. - */ - context->state = S_cro; /* &# is Char Ref Open */ - break; - } - context->state = S_entity; /* Fall through! */ - - /* - * Handle possible named entity. - */ - case S_entity: - if (TOASCII(unsign_c) < 127 && (string->size ? /* S/390 -- gil -- 1029 */ - isalnum(UCH(c)) : isalpha(UCH(c)))) { - /* Should probably use IsNmStart/IsNmChar above (is that right?), - but the world is not ready for that - there's  : (note - colon!) and stuff around. */ - /* - * Accept valid ASCII character. - FM - */ - HTChunkPutc(string, c); - } else if (string->size == 0) { - /* - * It was an ampersand that's just text, so output the ampersand - * and recycle this character. - FM - */ -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTART(badseq); -#endif - PUTC('&'); -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTOP(badseq); -#endif - context->state = S_text; - goto top1; - } else { - /* - * Terminate entity name and try to handle it. - FM - */ - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - entity_string = string->data; -#endif - /* S/390 -- gil -- 1039 */ - /* CTRACE((tfp, "%s: %d: %s\n", __FILE__, __LINE__, string->data)); */ - if (!strcmp(string->data, "zwnj") && - (!context->element_stack || - (context->element_stack->tag && - context->element_stack->tag->contents == SGML_MIXED))) { - /* - * Handle zwnj (8204) as <WBR>. - FM - */ - char temp[8]; - - CTRACE((tfp, - "SGML_character: Handling 'zwnj' entity as 'WBR' element.\n")); - - if (c != ';') { - sprintf(temp, "<WBR>%c", c); - } else { - sprintf(temp, "<WBR>"); - } - if (context->recover == NULL) { - StrAllocCopy(context->recover, temp); - context->recover_index = 0; - } else { - StrAllocCat(context->recover, temp); - } - string->size = 0; - context->state = S_text; - break; - } else { - handle_entity(context, '\0'); - } - string->size = 0; - context->state = S_text; - /* - * Don't eat the terminator if we didn't find the entity name and - * therefore sent the raw string via handle_entity(), or if the - * terminator is not the "standard" semi-colon for HTML. - FM - */ -#ifdef USE_PRETTYSRC - if (psrc_view && FoundEntity && c == ';') { - PSRCSTART(entity); - PUTC(c); - PSRCSTOP(entity); - } -#endif - if (!FoundEntity || c != ';') - goto top1; - } - break; - - /* - * Check for a numeric entity. - */ - case S_cro: - if (TOASCII(unsign_c) < 127 && TOLOWER(UCH(c)) == 'x') { /* S/390 -- gil -- 1060 */ - context->isHex = TRUE; - context->state = S_incro; - } else if (TOASCII(unsign_c) < 127 && isdigit(UCH(c))) { - /* - * Accept only valid ASCII digits. - FM - */ - HTChunkPutc(string, c); /* accumulate a character NUMBER */ - context->isHex = FALSE; - context->state = S_incro; - } else if (string->size == 0) { - /* - * No 'x' or digit following the "&#" so recover them and recycle - * the character. - FM - */ -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTART(badseq); -#endif - PUTC('&'); - PUTC('#'); -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTOP(badseq); -#endif - context->state = S_text; - goto top1; - } - break; - - /* - * Handle a numeric entity. - */ - case S_incro: - /* S/390 -- gil -- 1075 */ - if ((TOASCII(unsign_c) < 127) && - (context->isHex - ? isxdigit(UCH(c)) - : isdigit(UCH(c)))) { - /* - * Accept only valid hex or ASCII digits. - FM - */ - HTChunkPutc(string, c); /* accumulate a character NUMBER */ - } else if (string->size == 0) { - /* - * No hex digit following the "&#x" so recover them and recycle the - * character. - FM - */ -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTART(badseq); -#endif - PUTS("&#x"); -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTOP(badseq); -#endif - context->isHex = FALSE; - context->state = S_text; - goto top1; - } else { - /* - * Terminate the numeric entity and try to handle it. - FM - */ - unsigned long code; - int i; - - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - entity_string = string->data; -#endif - if ((context->isHex ? sscanf(string->data, "%lx", &code) : - sscanf(string->data, "%lu", &code)) == 1) { -/* =============== work in ASCII below here =============== S/390 -- gil -- 1092 */ - code = LYcp1252ToUnicode(code); - /* - * Check for special values. - FM - */ - if ((code == 8204) && - (!context->element_stack || - (context->element_stack->tag && - context->element_stack->tag->contents == SGML_MIXED))) { - /* - * Handle zwnj (8204) as <WBR>. - FM - */ - char temp[8]; - - CTRACE((tfp, - "SGML_character: Handling '8204' (zwnj) reference as 'WBR' element.\n")); - - /* - * Include the terminator if it is not the standard - * semi-colon. - FM - */ - if (c != ';') { - sprintf(temp, "<WBR>%c", c); - } else { - sprintf(temp, "<WBR>"); - } - /* - * Add the replacement string to the recover buffer for - * processing. - FM - */ - if (context->recover == NULL) { - StrAllocCopy(context->recover, temp); - context->recover_index = 0; - } else { - StrAllocCat(context->recover, temp); - } - string->size = 0; - context->isHex = FALSE; - context->state = S_text; - break; - } else if (put_special_unicodes(context, code)) { - /* - * We handled the value as a special character, so recycle - * the terminator or break. - FM - */ -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(entity); - PUTS((context->isHex ? "&#x" : "&#")); - PUTS(entity_string); - if (c == ';') - PUTC(';'); - PSRCSTOP(entity); - } -#endif - string->size = 0; - context->isHex = FALSE; - context->state = S_text; - if (c != ';') - goto top1; - break; - } - /* - * Seek a translation from the chartrans tables. - */ - if ((uck = UCTransUniChar(code, - context->outUCLYhndl)) >= 32 && - uck < 256 && - (uck < 127 || - uck >= LYlowest_eightbit[context->outUCLYhndl])) { -#ifdef USE_PRETTYSRC - if (!psrc_view) { -#endif - PUTC(FROMASCII((char) uck)); -#ifdef USE_PRETTYSRC - } else { - put_pretty_number(context); - } -#endif - } else if ((uck == -4 || - (context->T.repl_translated_C0 && - uck > 0 && uck < 32)) && - /* - * Not found; look for replacement string. - */ - (uck = UCTransUniCharStr(replace_buf, 60, code, - context->outUCLYhndl, - 0) >= 0)) { -#ifdef USE_PRETTYSRC - if (psrc_view) { - put_pretty_number(context); - } else -#endif - PUTS(replace_buf); - /* - * If we're displaying UTF-8, try that now. - FM - */ - } else if (context->T.output_utf8 && PUTUTF8(code)) { - ; /* do nothing more */ - /* - * Ignore 8205 (zwj), 8206 (lrm), and 8207 (rln), if we get - * to here. - FM - */ - } else if (code == 8205 || - code == 8206 || - code == 8207) { - if (TRACE) { - string->size--; - LYstrncpy(replace_buf, - string->data, - (string->size < 64 ? string->size : 63)); - fprintf(tfp, - "SGML_character: Ignoring '%s%s'.\n", - (context->isHex ? "&#x" : "&#"), - replace_buf); - } -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(badseq); - PUTS((context->isHex ? "&#x" : "&#")); - PUTS(entity_string); - if (c == ';') - PUTC(';'); - PSRCSTOP(badseq); - } -#endif - string->size = 0; - context->isHex = FALSE; - context->state = S_text; - if (c != ';') - goto top1; - break; - /* - * Show the numeric entity if we get to here and the value: - * (1) Is greater than 255 (but use ASCII characters for - * spaces or dashes). - * (2) Is less than 32, and not valid or we don't have - * HTCJK set. - * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK - * set. - * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum - * set. - * - FM - */ - } else if ((code > 255) || - (code < ' ' && /* S/390 -- gil -- 1140 */ - code != '\t' && code != '\n' && code != '\r' && - !IS_CJK_TTY) || - (TOASCII(code) == 127 && - !(HTPassHighCtrlRaw || IS_CJK_TTY)) || - (TOASCII(code) > 127 && code < 160 && - !HTPassHighCtrlNum)) { - /* - * Unhandled or illegal value. Recover the "&#" or "&#x" - * and digit(s), and recycle the terminator. - FM - */ -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(badseq); - } -#endif - if (context->isHex) { - PUTS("&#x"); - context->isHex = FALSE; - } else { - PUTS("&#"); - } - string->size--; - for (i = 0; i < string->size; i++) /* recover */ - PUTC(string->data[i]); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTOP(badseq); - } -#endif - string->size = 0; - context->isHex = FALSE; - context->state = S_text; - goto top1; - } else if (TOASCII(code) < 161 || /* S/390 -- gil -- 1162 */ - HTPassEightBitNum || - IncludesLatin1Enc) { - /* - * No conversion needed. - FM - */ -#ifdef USE_PRETTYSRC - if (psrc_view) { - put_pretty_number(context); - } else -#endif - PUTC(FROMASCII((char) code)); - } else { - /* - * Handle as named entity. - FM - */ - code -= 160; - EntityName = HTMLGetEntityName(code); - if (EntityName && EntityName[0] != '\0') { - string->size = 0; - HTChunkPuts(string, EntityName); - HTChunkTerminate(string); - handle_entity(context, '\0'); - /* - * Add a semi-colon if something went wrong and - * handle_entity() sent the string. - FM - */ - if (!FoundEntity) { - PUTC(';'); - } - } else { - /* - * Our conversion failed, so recover the "&#" and - * digit(s), and recycle the terminator. - FM - */ -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTART(badseq); -#endif - if (context->isHex) { - PUTS("&#x"); - context->isHex = FALSE; - } else { - PUTS("&#"); - } - string->size--; - for (i = 0; i < string->size; i++) /* recover */ - PUTC(string->data[i]); -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTOP(badseq); -#endif - string->size = 0; - context->isHex = FALSE; - context->state = S_text; - goto top1; - } - } - /* - * If we get to here, we succeeded. Hoorah!!! - FM - */ - string->size = 0; - context->isHex = FALSE; - context->state = S_text; - /* - * Don't eat the terminator if it's not the "standard" - * semi-colon for HTML. - FM - */ - if (c != ';') { - goto top1; - } - } else { - /* - * Not an entity, and don't know why not, so add the terminator - * to the string, output the "&#" or "&#x", and process the - * string via the recover element. - FM - */ - string->size--; - HTChunkPutc(string, c); - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTART(badseq); -#endif - if (context->isHex) { - PUTS("&#x"); - context->isHex = FALSE; - } else { - PUTS("&#"); - } -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTOP(badseq); -#endif - if (context->recover == NULL) { - StrAllocCopy(context->recover, string->data); - context->recover_index = 0; - } else { - StrAllocCat(context->recover, string->data); - } - string->size = 0; - context->isHex = FALSE; - context->state = S_text; - break; - } - } - break; - - /* - * Tag - */ - case S_tag: /* new tag */ - if (TOASCII(unsign_c) < 127 && (string->size ? /* S/390 -- gil -- 1179 */ - IsNmChar(c) : IsNmStart(c))) { - /* - * Add valid ASCII character. - FM - */ - HTChunkPutc(string, c); - } else if (c == '!' && !string->size) { /* <! */ - /* - * Terminate and set up for possible comment, identifier, - * declaration, or marked section. - FM - */ - context->state = S_exclamation; - context->lead_exclamation = TRUE; - context->doctype_bracket = FALSE; - context->first_bracket = FALSE; - HTChunkPutc(string, c); - break; - } else if (!string->size && - (TOASCII(unsign_c) <= 160 && /* S/390 -- gil -- 1196 */ - (c != '/' && c != '?' && c != '_' && c != ':'))) { - /* - * '<' must be followed by an ASCII letter to be a valid start tag. - * Here it isn't, nor do we have a '/' for an end tag, nor one of - * some other characters with a special meaning for SGML or which - * are likely to be legal Name Start characters in XML or some - * other extension. So recover the '<' and following character as - * data. - FM & KW - */ - context->state = S_text; -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTART(badseq); -#endif - PUTC('<'); -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTOP(badseq); -#endif - goto top1; - } else { /* End of tag name */ - /* - * Try to handle tag. - FM - */ - HTTag *t; - - if (c == '/') { - if (string->size == 0) { - context->state = S_end; - break; - } - CTRACE((tfp, "SGML: `<%.*s/' found!\n", string->size, string->data)); - } - HTChunkTerminate(string); - - t = SGMLFindTag(dtd, string->data); - if (t == context->unknown_tag && - ((c == ':' && - string->size == 4 && 0 == strcasecomp(string->data, "URL")) || - (string->size > 4 && 0 == strncasecomp(string->data, "URL:", 4)))) { - /* - * Treat <URL: as text rather than a junk tag, so we display - * it and the URL (Lynxism 8-). - FM - */ -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTART(badseq); -#endif - PUTC('<'); - PUTS(string->data); /* recover */ - PUTC(c); -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTOP(badseq); -#endif - CTRACE((tfp, "SGML: Treating <%s%c as text\n", - string->data, c)); - string->size = 0; - context->state = S_text; - break; - } - if (c == '/' && t) { - /* - * Element name was ended by '/'. Remember the tag that ended - * thusly, we'll interpret this as either an indication of an - * empty element (if '>' follows directly) or do some - * SGMLshortref-ish treatment. - kw - */ - context->slashedtag = t; - } - if (!t) { - if (c == '?' && string->size <= 1) { - CTRACE((tfp, "SGML: Found PI, looking for '>'\n")); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(abracket); - PUTS("<?"); - PSRCSTOP(abracket); - } -#endif - string->size = 0; - context->state = S_pi; - HTChunkPutc(string, c); - break; - } - CTRACE((tfp, "SGML: *** Invalid element %s\n", - string->data)); - -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(abracket); - PUTC('<'); - PSRCSTOP(abracket); - PSRCSTART(badtag); - transform_tag(context, string); - PUTS(string->data); - if (c == '>') { - PSRCSTOP(badtag); - PSRCSTART(abracket); - PUTC('>'); - PSRCSTOP(abracket); - } else { - PUTC(c); - } - } -#endif - context->state = (c == '>') ? S_text : S_junk_tag; - break; - } else if (t == context->unknown_tag) { - CTRACE((tfp, "SGML: *** Unknown element \"%s\"\n", - string->data)); - /* - * Fall through and treat like valid tag for attribute parsing. - * - KW - */ - - } - context->current_tag = t; - -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(abracket); - PUTC('<'); - PSRCSTOP(abracket); - if (t != context->unknown_tag) - PSRCSTART(tag); - else - PSRCSTART(badtag); - transform_tag(context, string); - PUTS(string->data); - if (t != context->unknown_tag) - PSRCSTOP(tag); - else - PSRCSTOP(badtag); - } - if (!psrc_view) /*don't waste time */ -#endif - { - /* - * Clear out attributes. - */ - memset((void *) context->present, 0, sizeof(BOOL) * - context->current_tag->number_of_attributes); - } - - string->size = 0; - context->current_attribute_number = INVALID; -#ifdef USE_PRETTYSRC - if (psrc_view) { - if (c == '>' || c == '<' || (c == '/' && context->slashedtag)) { - if (c != '<') { - PSRCSTART(abracket); - PUTC(c); - PSRCSTOP(abracket); - context->state = (c == '>') ? S_text : S_tagname_slash; - } else { - context->state = S_tag; - } - } else { - if (!WHITE(c)) - PUTC(c); - context->state = S_tag_gap; - } - } else -#endif - if (c == '>' || c == '<' || (c == '/' && context->slashedtag)) { - if (context->current_tag->name) - start_element(context); - context->state = (c == '>') ? S_text : - (c == '<') ? S_tag : S_tagname_slash; - } else { - context->state = S_tag_gap; - } - } - break; - - case S_exclamation: - if (context->lead_exclamation && c == '-') { - /* - * Set up for possible comment. - FM - */ - context->lead_exclamation = FALSE; - context->first_dash = TRUE; - HTChunkPutc(string, c); - break; - } - if (context->lead_exclamation && c == '[') { - /* - * Set up for possible marked section. - FM - */ - context->lead_exclamation = FALSE; - context->first_bracket = TRUE; - context->second_bracket = FALSE; - HTChunkPutc(string, c); - context->state = S_marked; - break; - } - if (context->first_dash && c == '-') { - /* - * Set up to handle comment. - FM - */ - context->lead_exclamation = FALSE; - context->first_dash = FALSE; - context->end_comment = FALSE; - HTChunkPutc(string, c); - context->state = S_comment; - break; - } - context->lead_exclamation = FALSE; - context->first_dash = FALSE; - if (c == '>') { - /* - * Try to handle identifier. - FM - */ - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(sgmlspecial); - PUTC('<'); - PUTS(string->data); - PUTC('>'); - PSRCSTOP(sgmlspecial); - } else -#endif - handle_identifier(context); - string->size = 0; - context->state = S_text; - break; - } - if (WHITE(c)) { - if (string->size == 8 && - !strncasecomp(string->data, "!DOCTYPE", 8)) { - /* - * Set up for DOCTYPE declaration. - FM - */ - HTChunkPutc(string, c); - context->doctype_bracket = FALSE; - context->state = S_doctype; - break; - } - if (string->size == 7 && - !strncasecomp(string->data, "!ENTITY", 7)) { - /* - * Set up for ENTITY declaration. - FM - */ - HTChunkPutc(string, c); - context->first_dash = FALSE; - context->end_comment = TRUE; - context->state = S_sgmlent; - break; - } - if (string->size == 8 && - !strncasecomp(string->data, "!ELEMENT", 8)) { - /* - * Set up for ELEMENT declaration. - FM - */ - HTChunkPutc(string, c); - context->first_dash = FALSE; - context->end_comment = TRUE; - context->state = S_sgmlele; - break; - } - if (string->size == 8 && - !strncasecomp(string->data, "!ATTLIST", 8)) { - /* - * Set up for ATTLIST declaration. - FM - */ - HTChunkPutc(string, c); - context->first_dash = FALSE; - context->end_comment = TRUE; - context->state = S_sgmlatt; - break; - } - } - HTChunkPutc(string, c); - break; - - case S_comment: /* Expecting comment. - FM */ - if (historical_comments) { - /* - * Any '>' terminates. - FM - */ - if (c == '>') { - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(comm); - PUTC('<'); - PUTS_TR(string->data); - PUTC('>'); - PSRCSTOP(comm); - } else -#endif - handle_comment(context); - string->size = 0; - context->end_comment = FALSE; - context->first_dash = FALSE; - context->state = S_text; - break; - } - goto S_comment_put_c; - } - if (!context->first_dash && c == '-') { - HTChunkPutc(string, c); - context->first_dash = TRUE; - break; - } - if (context->first_dash && c == '-') { - HTChunkPutc(string, c); - context->first_dash = FALSE; - if (!context->end_comment) - context->end_comment = TRUE; - else if (!minimal_comments) - /* - * Validly treat '--' pairs as successive comments (for - * minimal, any "--WHITE>" terminates). - FM - */ - context->end_comment = FALSE; - break; - } - if (context->end_comment && c == '>') { - /* - * Terminate and handle the comment. - FM - */ - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(comm); - PUTC('<'); - PUTS_TR(string->data); - PUTC('>'); - PSRCSTOP(comm); - } else -#endif - handle_comment(context); - string->size = 0; - context->end_comment = FALSE; - context->first_dash = FALSE; - context->state = S_text; - break; - } - context->first_dash = FALSE; - if (context->end_comment && !isspace(UCH(c))) - context->end_comment = FALSE; - - S_comment_put_c: - if (context->T.decode_utf8 && - *context->utf_buf) { - HTChunkPuts(string, context->utf_buf); - context->utf_buf_p = context->utf_buf; - *(context->utf_buf_p) = '\0'; - } else if (!IS_CJK_TTY && - (context->T.output_utf8 || - context->T.trans_from_uni)) { - if (clong == 0xfffd && saved_char_in && - HTPassEightBitRaw && - UCH(saved_char_in) >= - LYlowest_eightbit[context->outUCLYhndl]) { - HTChunkPutUtf8Char(string, - (0xf000 | UCH(saved_char_in))); - } else { - HTChunkPutUtf8Char(string, clong); - } - } else if (saved_char_in && context->T.use_raw_char_in) { - HTChunkPutc(string, saved_char_in); - } else { - HTChunkPutc(string, c); - } - break; - - case S_doctype: /* Expecting DOCTYPE. - FM */ - if (context->doctype_bracket) { - HTChunkPutc(string, c); - if (c == ']') - context->doctype_bracket = FALSE; - break; - } - if (c == '[' && WHITE(string->data[string->size - 1])) { - HTChunkPutc(string, c); - context->doctype_bracket = TRUE; - break; - } - if (c == '>') { - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(sgmlspecial); - PUTC('<'); - PUTS(string->data); - PUTC('>'); - PSRCSTOP(sgmlspecial); - } else -#endif - handle_doctype(context); - string->size = 0; - context->state = S_text; - break; - } - HTChunkPutc(string, c); - break; - - case S_marked: /* Expecting marked section. - FM */ - if (context->first_bracket && c == '[') { - HTChunkPutc(string, c); - context->first_bracket = FALSE; - context->second_bracket = TRUE; - break; - } - if (context->second_bracket && c == ']' && - string->data[string->size - 1] == ']') { - HTChunkPutc(string, c); - context->second_bracket = FALSE; - break; - } - if (!context->second_bracket && c == '>') { - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(sgmlspecial); - PUTC('<'); - PUTS(string->data); - PUTC('>'); - PSRCSTOP(sgmlspecial); - } else -#endif - handle_marked(context); - string->size = 0; - context->state = S_text; - break; - } - HTChunkPutc(string, c); - break; - - case S_sgmlent: /* Expecting ENTITY. - FM */ - if (!context->first_dash && c == '-') { - HTChunkPutc(string, c); - context->first_dash = TRUE; - break; - } - if (context->first_dash && c == '-') { - HTChunkPutc(string, c); - context->first_dash = FALSE; - if (!context->end_comment) - context->end_comment = TRUE; - else - context->end_comment = FALSE; - break; - } - if (context->end_comment && c == '>') { - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(sgmlspecial); - PUTC('<'); - PUTS(string->data); - PUTC('>'); - PSRCSTOP(sgmlspecial); - } else -#endif - handle_sgmlent(context); - string->size = 0; - context->end_comment = FALSE; - context->first_dash = FALSE; - context->state = S_text; - break; - } - context->first_dash = FALSE; - HTChunkPutc(string, c); - break; - - case S_sgmlele: /* Expecting ELEMENT. - FM */ - if (!context->first_dash && c == '-') { - HTChunkPutc(string, c); - context->first_dash = TRUE; - break; - } - if (context->first_dash && c == '-') { - HTChunkPutc(string, c); - context->first_dash = FALSE; - if (!context->end_comment) - context->end_comment = TRUE; - else - context->end_comment = FALSE; - break; - } - if (context->end_comment && c == '>') { - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(sgmlspecial); - PUTC('<'); - PUTS(string->data); - PUTC('>'); - PSRCSTOP(sgmlspecial); - } else -#endif - handle_sgmlele(context); - string->size = 0; - context->end_comment = FALSE; - context->first_dash = FALSE; - context->state = S_text; - break; - } - context->first_dash = FALSE; - HTChunkPutc(string, c); - break; - - case S_sgmlatt: /* Expecting ATTLIST. - FM */ - if (!context->first_dash && c == '-') { - HTChunkPutc(string, c); - context->first_dash = TRUE; - break; - } - if (context->first_dash && c == '-') { - HTChunkPutc(string, c); - context->first_dash = FALSE; - if (!context->end_comment) - context->end_comment = TRUE; - else - context->end_comment = FALSE; - break; - } - if (context->end_comment && c == '>') { - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(sgmlspecial); - PUTC('<'); - PUTS(string->data); - PUTC('>'); - PSRCSTOP(sgmlspecial); - } else -#endif - handle_sgmlatt(context); - string->size = 0; - context->end_comment = FALSE; - context->first_dash = FALSE; - context->state = S_text; - break; - } - context->first_dash = FALSE; - HTChunkPutc(string, c); - break; - - case S_tag_gap: /* Expecting attribute or '>' */ - if (WHITE(c)) { - /* PUTC(c); - no, done as special case */ - break; /* Gap between attributes */ - } - if (c == '>') { /* End of tag */ -#ifdef USE_PRETTYSRC - if (!psrc_view) -#endif - if (context->current_tag->name) - start_element(context); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(abracket); - PUTC('>'); - PSRCSTOP(abracket); - } -#endif - context->state = S_text; - break; - } - HTChunkPutc(string, c); - context->state = S_attr; /* Get attribute */ - break; - - /* accumulating value */ - case S_attr: - if (WHITE(c) || (c == '>') || (c == '=')) { /* End of word */ - if ((c == '>') - && (string->size == 1) - && (string->data[0] == '/')) { - if (context->extended_html - && ignore_when_empty(context->current_tag)) { - discard_empty(context); - } - } else { - HTChunkTerminate(string); - handle_attribute_name(context, string->data); - } -#ifdef USE_PRETTYSRC - if (!psrc_view) { -#endif - string->size = 0; - if (c == '>') { /* End of tag */ - if (context->current_tag->name) - start_element(context); - context->state = S_text; - break; - } -#ifdef USE_PRETTYSRC - } else { - PUTC(' '); - if (context->current_attribute_number == INVALID) - PSRCSTART(badattr); - else - PSRCSTART(attrib); - if (attrname_transform != 1) { - if (attrname_transform == 0) - LYLowerCase(string->data); - else - LYUpperCase(string->data); - } - PUTS(string->data); - if (c == '=' || WHITE(c)) - PUTC(c); - if (c == '=' || c == '>') { - if (context->current_attribute_number == INVALID) { - PSRCSTOP(badattr); - } else { - PSRCSTOP(attrib); - } - } - if (c == '>') { - PSRCSTART(abracket); - PUTC('>'); - PSRCSTOP(abracket); - context->state = S_text; - break; - } - string->size = 0; - } -#endif - context->state = (c == '=' ? S_equals : S_attr_gap); - } else { - HTChunkPutc(string, c); - } - break; - - case S_attr_gap: /* Expecting attribute or '=' or '>' */ - if (WHITE(c)) { - PRETTYSRC_PUTC(c); - break; /* Gap after attribute */ - } - if (c == '>') { /* End of tag */ -#ifdef USE_PRETTYSRC - if (psrc_view) { - if (context->current_attribute_number == INVALID) { - PSRCSTOP(badattr); - } else { - PSRCSTOP(attrib); - } - PSRCSTART(abracket); - PUTC('>'); - PSRCSTOP(abracket); - } else -#endif - if (context->current_tag->name) - start_element(context); - context->state = S_text; - break; - } else if (c == '=') { -#ifdef USE_PRETTYSRC - if (psrc_view) { - PUTC('='); - if (context->current_attribute_number == INVALID) { - PSRCSTOP(badattr); - } else { - PSRCSTOP(attrib); - } - } -#endif - context->state = S_equals; - break; - } - HTChunkPutc(string, c); - context->state = S_attr; /* Get next attribute */ - break; - - case S_equals: /* After attr = */ - if (WHITE(c)) { - PRETTYSRC_PUTC(c); - break; /* Before attribute value */ - } - if (c == '>') { /* End of tag */ - CTRACE((tfp, "SGML: found = but no value\n")); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(abracket); - PUTC('>'); - PSRCSTOP(abracket); - } else -#endif - if (context->current_tag->name) - start_element(context); - context->state = S_text; - break; - - } else if (c == '\'') { -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(attrval); - PUTC(c); - } -#endif - context->state = S_squoted; - break; - - } else if (c == '"') { -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(attrval); - PUTC(c); - } -#endif - context->state = S_dquoted; - break; - } -#ifdef USE_PRETTYSRC - if (psrc_view) - PSRCSTART(attrval); -#endif - context->state = S_value; - /* no break! fall through to S_value and process current `c` */ - - case S_value: - if (WHITE(c) || (c == '>')) { /* End of word */ - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (!end_if_prettysrc(context, string, 0)) -#endif - { -#ifdef CJK_EX /* Quick hack. - JH7AYN */ - if (IS_CJK_TTY) { - if (string->data[0] == '$') { - if (string->data[1] == 'B' || string->data[1] == '@') { - char *jis_buf = 0; - - HTSprintf0(&jis_buf, "\033%s", string->data); - TO_EUC((const unsigned char *) jis_buf, - (unsigned char *) string->data); - FREE(jis_buf); - } - } - } -#endif - handle_attribute_value(context, string->data); - } - string->size = 0; - if (c == '>') { /* End of tag */ -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(abracket); - PUTC('>'); - PSRCSTOP(abracket); - } else -#endif - if (context->current_tag->name) - start_element(context); - context->state = S_text; - break; - } else - context->state = S_tag_gap; - } else if (context->T.decode_utf8 && - *context->utf_buf) { - HTChunkPuts(string, context->utf_buf); - context->utf_buf_p = context->utf_buf; - *(context->utf_buf_p) = '\0'; - } else if (!IS_CJK_TTY && - (context->T.output_utf8 || - context->T.trans_from_uni)) { - if (clong == 0xfffd && saved_char_in && - HTPassEightBitRaw && - UCH(saved_char_in) >= - LYlowest_eightbit[context->outUCLYhndl]) { - HTChunkPutUtf8Char(string, - (0xf000 | UCH(saved_char_in))); - } else { - HTChunkPutUtf8Char(string, clong); - } - } else if (saved_char_in && context->T.use_raw_char_in) { - HTChunkPutc(string, saved_char_in); - } else { - HTChunkPutc(string, c); - } - break; - - case S_squoted: /* Quoted attribute value */ - if (c == '\'') { /* End of attribute value */ - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (!end_if_prettysrc(context, string, '\'')) -#endif - handle_attribute_value(context, string->data); - string->size = 0; - context->state = S_tag_gap; - } else if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1213 */ - /* - * Setting up for possible single quotes in CJK escape sequences. - * - Takuya ASADA (asada@three-a.co.jp) - */ - context->state = S_esc_sq; - HTChunkPutc(string, c); - } else if (context->T.decode_utf8 && - *context->utf_buf) { - HTChunkPuts(string, context->utf_buf); - context->utf_buf_p = context->utf_buf; - *(context->utf_buf_p) = '\0'; - } else if (!IS_CJK_TTY && - (context->T.output_utf8 || - context->T.trans_from_uni)) { - if (clong == 0xfffd && saved_char_in && - HTPassEightBitRaw && - UCH(saved_char_in) >= - LYlowest_eightbit[context->outUCLYhndl]) { - HTChunkPutUtf8Char(string, - (0xf000 | UCH(saved_char_in))); - } else { - HTChunkPutUtf8Char(string, clong); - } - } else if (saved_char_in && context->T.use_raw_char_in) { - HTChunkPutc(string, saved_char_in); - } else { - HTChunkPutc(string, c); - } - break; - - case S_dquoted: /* Quoted attribute value */ - if (c == '"' || /* Valid end of attribute value */ - (soft_dquotes && /* If emulating old Netscape bug, treat '>' */ - c == '>')) { /* as a co-terminator of dquoted and tag */ - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (!end_if_prettysrc(context, string, c)) -#endif - handle_attribute_value(context, string->data); - string->size = 0; - context->state = S_tag_gap; - if (c == '>') /* We emulated the Netscape bug, so we go */ - goto top1; /* back and treat it as the tag terminator */ - } else if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1230 */ - /* - * Setting up for possible double quotes in CJK escape sequences. - * - Takuya ASADA (asada@three-a.co.jp) - */ - context->state = S_esc_dq; - HTChunkPutc(string, c); - } else if (context->T.decode_utf8 && - *context->utf_buf) { - HTChunkPuts(string, context->utf_buf); - context->utf_buf_p = context->utf_buf; - *(context->utf_buf_p) = '\0'; - } else if (!IS_CJK_TTY && - (context->T.output_utf8 || - context->T.trans_from_uni)) { - if (clong == 0xfffd && saved_char_in && - HTPassEightBitRaw && - UCH(saved_char_in) >= - LYlowest_eightbit[context->outUCLYhndl]) { - HTChunkPutUtf8Char(string, - (0xf000 | UCH(saved_char_in))); - } else { - HTChunkPutUtf8Char(string, clong); - } - } else if (saved_char_in && context->T.use_raw_char_in) { - HTChunkPutc(string, saved_char_in); - } else { - HTChunkPutc(string, c); - } - break; - - case S_end: /* </ */ - if (TOASCII(unsign_c) < 127 && (string->size ? /* S/390 -- gil -- 1247 */ - IsNmChar(c) : IsNmStart(c))) { - HTChunkPutc(string, c); - } else { /* End of end tag name */ - HTTag *t = 0; - -#ifdef USE_PRETTYSRC - BOOL psrc_tagname_processed = FALSE; -#endif - - HTChunkTerminate(string); - if (!*string->data) { /* Empty end tag */ - if (context->element_stack) - t = context->element_stack->tag; - } else { - t = SGMLFindTag(dtd, string->data); - } - if (!t || t == context->unknown_tag) { - CTRACE((tfp, "Unknown end tag </%s>\n", string->data)); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PSRCSTART(abracket); - PUTS("</"); - PSRCSTOP(abracket); - PSRCSTART(badtag); - transform_tag(context, string); - PUTS(string->data); - if (c != '>') { - PUTC(c); - } else { - PSRCSTOP(badtag); - PSRCSTART(abracket); - PUTC('>'); - PSRCSTOP(abracket); - } - psrc_tagname_processed = TRUE; - } - } else if (psrc_view) { -#endif - } else { - BOOL tag_OK = (BOOL) (c == '>' || WHITE(c)); - HTMLElement e = TAGNUM_OF_TAGP(t); - int branch = 2; /* it can be 0,1,2 */ - - context->current_tag = t; - if (HAS_ALT_TAGNUM(TAGNUM_OF_TAGP(t)) && - context->element_stack && - ALT_TAGP(t) == context->element_stack->tag) - context->element_stack->tag = NORMAL_TAGP(context->element_stack->tag); - - if (tag_OK && Old_DTD) { - switch (e) { - case HTML_DD: - case HTML_DT: - case HTML_LI: - case HTML_LH: - case HTML_TD: - case HTML_TH: - case HTML_TR: - case HTML_THEAD: - case HTML_TFOOT: - case HTML_TBODY: - case HTML_COLGROUP: - branch = 0; - break; - - case HTML_A: - case HTML_B: - case HTML_BLINK: - case HTML_CITE: - case HTML_EM: - case HTML_FONT: - case HTML_FORM: - case HTML_I: - case HTML_P: - case HTML_STRONG: - case HTML_TT: - case HTML_U: - branch = 1; - break; - default: - break; - } - } - - /* - * Just handle ALL end tags normally :-) - kw - */ - if (!Old_DTD) { - end_element(context, context->current_tag); - } else if (tag_OK && (branch == 0)) { - /* - * Don't treat these end tags as invalid, nor act on them. - * - FM - */ - CTRACE((tfp, "SGML: `</%s%c' found! Ignoring it.\n", - string->data, c)); - string->size = 0; - context->current_attribute_number = INVALID; - if (c != '>') { - context->state = S_junk_tag; - } else { - context->current_tag = NULL; - context->state = S_text; - } - break; - } else if (tag_OK && (branch == 1)) { - /* - * Handle end tags for container elements declared as - * SGML_EMPTY to prevent "expected tag substitution" but - * still processed via HTML_end_element() in HTML.c with - * checks there to avoid throwing the HTML.c stack out of - * whack (Ugh, what a hack! 8-). - FM - */ - if (context->inSELECT) { - /* - * We are in a SELECT block. - FM - */ - if (strcasecomp(string->data, "FORM")) { - /* - * It is not at FORM end tag, so ignore it. - FM - */ - CTRACE((tfp, - "SGML: ***Ignoring end tag </%s> in SELECT block.\n", - string->data)); - } else { - /* - * End the SELECT block and then handle the FORM - * end tag. - FM - */ - CTRACE((tfp, - "SGML: ***Faking SELECT end tag before </%s> end tag.\n", - string->data)); - end_element(context, - SGMLFindTag(context->dtd, "SELECT")); - CTRACE((tfp, "SGML: End </%s>\n", string->data)); - -#ifdef USE_PRETTYSRC - if (!psrc_view) /* Don't actually call if viewing psrc - kw */ -#endif - (*context->actions->end_element) - (context->target, - TAGNUM_OF_TAGP(context->current_tag), - &context->include); - } - } else if (!strcasecomp(string->data, "P")) { - /* - * Treat a P end tag like a P start tag (Ugh, what a - * hack! 8-). - FM - */ - CTRACE((tfp, - "SGML: `</%s%c' found! Treating as '<%s%c'.\n", - string->data, c, string->data, c)); - { - int i; - - for (i = 0; - i < context->current_tag->number_of_attributes; - i++) { - context->present[i] = NO; - } - } - if (context->current_tag->name) - start_element(context); - } else { - CTRACE((tfp, "SGML: End </%s>\n", string->data)); - -#ifdef USE_PRETTYSRC - if (!psrc_view) /* Don't actually call if viewing psrc - kw */ -#endif - (*context->actions->end_element) - (context->target, - TAGNUM_OF_TAGP(context->current_tag), - &context->include); - } - string->size = 0; - context->current_attribute_number = INVALID; - if (c != '>') { - context->state = S_junk_tag; - } else { - context->current_tag = NULL; - context->state = S_text; - } - break; - } else { - /* - * Handle all other end tags normally. - FM - */ - end_element(context, context->current_tag); - } - } - -#ifdef USE_PRETTYSRC - if (psrc_view && !psrc_tagname_processed) { - PSRCSTART(abracket); - PUTS("</"); - PSRCSTOP(abracket); - PSRCSTART(tag); - if (tagname_transform != 1) { - if (tagname_transform == 0) - LYLowerCase(string->data); - else - LYUpperCase(string->data); - } - PUTS(string->data); - PSRCSTOP(tag); - if (c != '>') { - PSRCSTART(badtag); - PUTC(c); - } else { - PSRCSTART(abracket); - PUTC('>'); - PSRCSTOP(abracket); - } - } -#endif - - string->size = 0; - context->current_attribute_number = INVALID; - if (c != '>') { - if (!WHITE(c)) - CTRACE((tfp, "SGML: `</%s%c' found!\n", string->data, c)); - context->state = S_junk_tag; - } else { - context->current_tag = NULL; - context->state = S_text; - } - } - break; - - case S_esc: /* Expecting '$'or '(' following CJK ESC. */ - if (c == '$') { - context->state = S_dollar; - } else if (c == '(') { - context->state = S_paren; - } else { - context->state = S_text; - } - PUTC(c); - break; - - case S_dollar: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ - if (c == '@' || c == 'B' || c == 'A') { - context->state = S_nonascii_text; - } else if (c == '(') { - context->state = S_dollar_paren; - } - PUTC(c); - break; - - case S_dollar_paren: /* Expecting 'C' after CJK "ESC$(". */ - if (c == 'C') { - context->state = S_nonascii_text; - } else { - context->state = S_text; - } - PUTC(c); - break; - - case S_paren: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ - if (c == 'B' || c == 'J' || c == 'T') { - context->state = S_text; - } else if (c == 'I') { - context->state = S_nonascii_text; - } else { - context->state = S_text; - } - PUTC(c); - break; - - case S_nonascii_text: /* Expecting CJK ESC after non-ASCII text. */ - if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1264 */ - context->state = S_esc; - } - PUTC(c); - if (c < 32) - context->state = S_text; - break; - - case S_esc_sq: /* Expecting '$'or '(' following CJK ESC. */ - if (c == '$') { - context->state = S_dollar_sq; - } else if (c == '(') { - context->state = S_paren_sq; - } else { - context->state = S_squoted; - } - HTChunkPutc(string, c); - break; - - case S_dollar_sq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ - if (c == '@' || c == 'B' || c == 'A') { - context->state = S_nonascii_text_sq; - } else if (c == '(') { - context->state = S_dollar_paren_sq; - } - HTChunkPutc(string, c); - break; - - case S_dollar_paren_sq: /* Expecting 'C' after CJK "ESC$(". */ - if (c == 'C') { - context->state = S_nonascii_text_sq; - } else { - context->state = S_squoted; - } - HTChunkPutc(string, c); - break; - - case S_paren_sq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ - if (c == 'B' || c == 'J' || c == 'T') { - context->state = S_squoted; - } else if (c == 'I') { - context->state = S_nonascii_text_sq; - } else { - context->state = S_squoted; - } - HTChunkPutc(string, c); - break; - - case S_nonascii_text_sq: /* Expecting CJK ESC after non-ASCII text. */ - if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1281 */ - context->state = S_esc_sq; - } - HTChunkPutc(string, c); - break; - - case S_esc_dq: /* Expecting '$'or '(' following CJK ESC. */ - if (c == '$') { - context->state = S_dollar_dq; - } else if (c == '(') { - context->state = S_paren_dq; - } else { - context->state = S_dquoted; - } - HTChunkPutc(string, c); - break; - - case S_dollar_dq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ - if (c == '@' || c == 'B' || c == 'A') { - context->state = S_nonascii_text_dq; - } else if (c == '(') { - context->state = S_dollar_paren_dq; - } - HTChunkPutc(string, c); - break; - - case S_dollar_paren_dq: /* Expecting 'C' after CJK "ESC$(". */ - if (c == 'C') { - context->state = S_nonascii_text_dq; - } else { - context->state = S_dquoted; - } - HTChunkPutc(string, c); - break; - - case S_paren_dq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ - if (c == 'B' || c == 'J' || c == 'T') { - context->state = S_dquoted; - } else if (c == 'I') { - context->state = S_nonascii_text_dq; - } else { - context->state = S_dquoted; - } - HTChunkPutc(string, c); - break; - - case S_nonascii_text_dq: /* Expecting CJK ESC after non-ASCII text. */ - if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1298 */ - context->state = S_esc_dq; - } - HTChunkPutc(string, c); - break; - - case S_junk_tag: - case S_pi: - if (c == '>') { - HTChunkTerminate(string); -#ifdef USE_PRETTYSRC - if (psrc_view) { - if (context->state == S_junk_tag) { - PSRCSTOP(badtag); - } - PSRCSTART(abracket); - PUTC('>'); - PSRCSTOP(abracket); - } -#endif - if (context->state == S_pi) - handle_processing_instruction(context); - string->size = 0; - context->current_tag = NULL; - context->state = S_text; - } else { - HTChunkPutc(string, c); -#ifdef USE_PRETTYSRC - if (psrc_view) { - PUTC(c); - } -#endif - } - - } /* switch on context->state */ - CTRACE2(TRACE_SGML, (tfp, "SGML after %s|%.*s|%c|\n", - state_name(context->state), - string->size, - NonNull(string->data), - UCH(c))); - - after_switch: - /* - * Check whether an external function has added anything to the include - * buffer. If so, move the new stuff to the beginning of active_include. - * - kw - */ - if (context->include != NULL) { - if (context->include[0] == '\0') { - FREE(context->include); - } else { - if (context->active_include && - context->active_include[context->include_index] != '\0') - StrAllocCat(context->include, - context->active_include + context->include_index); - FREE(context->active_include); - context->active_include = context->include; - context->include_index = 0; - context->include = NULL; - } - } - - /* - * Check whether we've added anything to the recover buffer. - FM - */ - if (context->recover != NULL) { - if (context->recover[context->recover_index] == '\0') { - FREE(context->recover); - context->recover_index = 0; - } else { - c = context->recover[context->recover_index]; - context->recover_index++; - goto top; - } - } - - /* - * Check whether an external function had added anything to the include - * buffer; it should now be in active_include. - FM / kw - */ - if (context->active_include != NULL) { - if (context->active_include[context->include_index] == '\0') { - FREE(context->active_include); - context->include_index = 0; - } else { - if (context->current_tag_charset == UTF8_handle || - context->T.trans_from_uni) { - /* - * If it looks like we would have fed UTF-8 to the next - * processing stage, assume that whatever we were fed back is - * in UTF-8 form, too. This won't be always true for all uses - * of the include buffer, but it's a start. - kw - */ - char *puni = context->active_include + context->include_index; - - c = *puni; - clong = UCGetUniFromUtf8String(&puni); - if (clong < 256 && clong >= 0) { - c = ((char) (clong & 0xff)); - } - saved_char_in = '\0'; - context->include_index = puni - context->active_include + 1; - goto top1; - } else { - /* - * Otherwise assume no UTF-8 - do charset-naive processing and - * hope for the best. - kw - */ - c = context->active_include[context->include_index]; - context->include_index++; - goto top; - } - } - } - - /* - * Check whether an external function has added anything to the csi buffer. - * - FM - */ - if (context->csi != NULL) { - if (context->csi[context->csi_index] == '\0') { - FREE(context->csi); - context->csi_index = 0; - } else { - c = context->csi[context->csi_index]; - context->csi_index++; - goto top; - } - } -} /* SGML_character */ - -static void InferUtfFromBom(HTStream *context, int chndl) -{ - HTAnchor_setUCInfoStage(context->node_anchor, chndl, - UCT_STAGE_PARSER, - UCT_SETBY_PARSER); - change_chartrans_handling(context); -} - -/* - * Avoid rewrite of SGML_character() to handle hypothetical case of UTF-16 - * webpages, by pretending that the data is UTF-8. - */ -static void SGML_widechar(HTStream *context, long ch) -{ - if (!UCPutUtf8_charstring(context, SGML_character, ch)) { - SGML_character(context, UCH(ch)); - } -} - -static void SGML_write(HTStream *context, const char *str, int l) -{ - const char *p; - const char *e = str + l; - - if (sgml_offset == 0) { - if (l > 3 - && !memcmp(str, "\357\273\277", 3)) { - CTRACE((tfp, "SGML_write found UTF-8 BOM\n")); - InferUtfFromBom(context, UTF8_handle); - str += 3; - } else if (l > 2) { - if (!memcmp(str, "\377\376", 2)) { - CTRACE((tfp, "SGML_write found UCS-2 LE BOM\n")); - InferUtfFromBom(context, UTF8_handle); - str += 2; - context->T.ucs_mode = -1; - } else if (!memcmp(str, "\376\377", 2)) { - CTRACE((tfp, "SGML_write found UCS-2 BE BOM\n")); - InferUtfFromBom(context, UTF8_handle); - str += 2; - context->T.ucs_mode = 1; - } - } - } - switch (context->T.ucs_mode) { - case -1: - for (p = str; p < e; p += 2) - SGML_widechar(context, (UCH(p[1]) << 8) | UCH(p[0])); - break; - case 1: - for (p = str; p < e; p += 2) - SGML_widechar(context, (UCH(p[0]) << 8) | UCH(p[1])); - break; - default: - for (p = str; p < e; p++) - SGML_character(context, *p); - break; - } -} - -static void SGML_string(HTStream *context, const char *str) -{ - SGML_write(context, str, strlen(str)); -} - -/*_______________________________________________________________________ -*/ - -/* Structured Object Class - * ----------------------- - */ -const HTStreamClass SGMLParser = -{ - "SGMLParser", - SGML_free, - SGML_abort, - SGML_character, - SGML_string, - SGML_write, -}; - -/* Create SGML Engine - * ------------------ - * - * On entry, - * dtd represents the DTD, along with - * actions is the sink for the data as a set of routines. - * - */ - -HTStream *SGML_new(const SGML_dtd * dtd, - HTParentAnchor *anchor, - HTStructured * target) -{ - HTStream *context = typecalloc(struct _HTStream); - - if (!context) - outofmem(__FILE__, "SGML_begin"); - - context->isa = &SGMLParser; - context->string = HTChunkCreate(128); /* Grow by this much */ - context->dtd = dtd; - context->target = target; - context->actions = (const HTStructuredClass *) (((HTStream *) target)->isa); - /* Ugh: no OO */ - context->unknown_tag = &HTTag_unrecognized; - context->current_tag = context->slashedtag = NULL; - context->state = S_text; -#ifdef CALLERDATA - context->callerData = (void *) callerData; -#endif /* CALLERDATA */ - - context->node_anchor = anchor; /* Could be NULL? */ - context->utf_buf_p = context->utf_buf; - UCTransParams_clear(&context->T); - context->inUCLYhndl = HTAnchor_getUCLYhndl(anchor, - UCT_STAGE_PARSER); - if (context->inUCLYhndl < 0) { - HTAnchor_copyUCInfoStage(anchor, - UCT_STAGE_PARSER, - UCT_STAGE_MIME, - -1); - context->inUCLYhndl = HTAnchor_getUCLYhndl(anchor, - UCT_STAGE_PARSER); - } -#ifdef CAN_SWITCH_DISPLAY_CHARSET /* Allow a switch to a more suitable display charset */ - else if (anchor->UCStages - && anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl >= 0 - && anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl != current_char_set) { - int o = anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl; - - anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl = -1; /* Force reset */ - HTAnchor_resetUCInfoStage(anchor, o, UCT_STAGE_PARSER, - /* Preserve change this: */ - anchor->UCStages->s[UCT_STAGE_PARSER].lock); - } -#endif - - context->inUCI = HTAnchor_getUCInfoStage(anchor, - UCT_STAGE_PARSER); - set_chartrans_handling(context, anchor, -1); - - context->recover = NULL; - context->recover_index = 0; - context->include = NULL; - context->active_include = NULL; - context->include_index = 0; - context->url = NULL; - context->csi = NULL; - context->csi_index = 0; - -#ifdef USE_PRETTYSRC - if (psrc_view) { - psrc_view = FALSE; - mark_htext_as_source = TRUE; - SGML_string(context, - "<HTML><HEAD><TITLE>source</TITLE></HEAD><BODY><PRE>"); - psrc_view = TRUE; - psrc_convert_string = FALSE; - sgml_in_psrc_was_initialized = TRUE; - } -#endif - - sgml_offset = 0; - return context; -} - -/* - * Return the offset within the document where we're parsing. This is used - * to help identify anchors which shift around while reparsing. - */ -int SGML_offset(void) -{ - int result = sgml_offset; - -#ifdef USE_PRETTYSRC - result += psrc_view; -#endif - return result; -} - -/* Asian character conversion functions - * ==================================== - * - * Added 24-Mar-96 by FM, based on: - * - //////////////////////////////////////////////////////////////////////// -Copyright (c) 1993 Electrotechnical Laboratory (ETL) - -Permission to use, copy, modify, and distribute this material -for any purpose and without fee is hereby granted, provided -that the above copyright notice and this permission notice -appear in all copies, and that the name of ETL not be -used in advertising or publicity pertaining to this -material without the specific, prior written permission -of an authorized representative of ETL. -ETL MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY -OF THIS MATERIAL FOR ANY PURPOSE. IT IS PROVIDED "AS IS", -WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. -///////////////////////////////////////////////////////////////////////// -Content-Type: program/C; charset=US-ASCII -Program: SJIS.c -Author: Yutaka Sato <ysato@etl.go.jp> -Description: -History: - 930923 extracted from codeconv.c of cosmos -/////////////////////////////////////////////////////////////////////// -*/ - -static int TREAT_SJIS = 1; - -void JISx0201TO0208_EUC(unsigned char IHI, - unsigned char ILO, - unsigned char *OHI, - unsigned char *OLO) -{ - static const char *table[] = - { - "\241\243", /* A1,A3 */ - "\241\326", /* A1,D6 */ - "\241\327", /* A1,D7 */ - "\241\242", /* A1,A2 */ - "\241\246", /* A1,A6 */ - "\245\362", /* A5,F2 */ - "\245\241", /* A5,A1 */ - "\245\243", /* A5,A3 */ - "\245\245", /* A5,A5 */ - "\245\247", /* A5,A7 */ - "\245\251", /* A5,A9 */ - "\245\343", /* A5,E3 */ - "\245\345", /* A5,E5 */ - "\245\347", /* A5,E7 */ - "\245\303", /* A5,C3 */ - "\241\274", /* A1,BC */ - "\245\242", /* A5,A2 */ - "\245\244", /* A5,A4 */ - "\245\246", /* A5,A6 */ - "\245\250", /* A5,A8 */ - "\245\252", /* A5,AA */ - "\245\253", /* A5,AB */ - "\245\255", /* A5,AD */ - "\245\257", /* A5,AF */ - "\245\261", /* A5,B1 */ - "\245\263", /* A5,B3 */ - "\245\265", /* A5,B5 */ - "\245\267", /* A5,B7 */ - "\245\271", /* A5,B9 */ - "\245\273", /* A5,BB */ - "\245\275", /* A5,BD */ - "\245\277", /* A5,BF */ - "\245\301", /* A5,C1 */ - "\245\304", /* A5,C4 */ - "\245\306", /* A5,C6 */ - "\245\310", /* A5,C8 */ - "\245\312", /* A5,CA */ - "\245\313", /* A5,CB */ - "\245\314", /* A5,CC */ - "\245\315", /* A5,CD */ - "\245\316", /* A5,CE */ - "\245\317", /* A5,CF */ - "\245\322", /* A5,D2 */ - "\245\325", /* A5,D5 */ - "\245\330", /* A5,D8 */ - "\245\333", /* A5,DB */ - "\245\336", /* A5,DE */ - "\245\337", /* A5,DF */ - "\245\340", /* A5,E0 */ - "\245\341", /* A5,E1 */ - "\245\342", /* A5,E2 */ - "\245\344", /* A5,E4 */ - "\245\346", /* A5,E6 */ - "\245\350", /* A5,E8 */ - "\245\351", /* A5,E9 */ - "\245\352", /* A5,EA */ - "\245\353", /* A5,EB */ - "\245\354", /* A5,EC */ - "\245\355", /* A5,ED */ - "\245\357", /* A5,EF */ - "\245\363", /* A5,F3 */ - "\241\253", /* A1,AB */ - "\241\254" /* A1,AC */ - }; - - if ((IHI == 0x8E) && (ILO >= 0xA1) && (ILO <= 0xDF)) { - *OHI = table[ILO - 0xA1][0]; - *OLO = table[ILO - 0xA1][1]; - } else { - *OHI = IHI; - *OLO = ILO; - } -} - -static int IS_SJIS_STR(const unsigned char *str) -{ - const unsigned char *s; - unsigned char ch; - int is_sjis = 0; - - s = str; - while ((ch = *s++) != '\0') { - if (ch & 0x80) - if (IS_SJIS(ch, *s, is_sjis)) - return 1; - } - return 0; -} - -unsigned char *SJIS_TO_JIS1(register unsigned char HI, - register unsigned char LO, - register unsigned char *JCODE) -{ - HI -= UCH((HI <= 0x9F) ? 0x71 : 0xB1); - HI = UCH((HI << 1) + 1); - if (0x7F < LO) - LO--; - if (0x9E <= LO) { - LO -= UCH(0x7D); - HI++; - } else { - LO -= UCH(0x1F); - } - JCODE[0] = HI; - JCODE[1] = LO; - return JCODE; -} - -unsigned char *JIS_TO_SJIS1(register unsigned char HI, - register unsigned char LO, - register unsigned char *SJCODE) -{ - if (HI & 1) - LO += UCH(0x1F); - else - LO += UCH(0x7D); - if (0x7F <= LO) - LO++; - - HI = UCH(((HI - 0x21) >> 1) + 0x81); - if (0x9F < HI) - HI += UCH(0x40); - SJCODE[0] = HI; - SJCODE[1] = LO; - return SJCODE; -} - -unsigned char *EUC_TO_SJIS1(unsigned char HI, - unsigned char LO, - register unsigned char *SJCODE) -{ - if (HI == 0x8E) - JISx0201TO0208_EUC(HI, LO, &HI, &LO); - JIS_TO_SJIS1(UCH(HI & 0x7F), UCH(LO & 0x7F), SJCODE); - return SJCODE; -} - -void JISx0201TO0208_SJIS(register unsigned char I, - register unsigned char *OHI, - register unsigned char *OLO) -{ - unsigned char SJCODE[2]; - - JISx0201TO0208_EUC(0x8E, I, OHI, OLO); - JIS_TO_SJIS1(UCH(*OHI & 0x7F), UCH(*OLO & 0x7F), SJCODE); - *OHI = SJCODE[0]; - *OLO = SJCODE[1]; -} - -unsigned char *SJIS_TO_EUC1(unsigned char HI, - unsigned char LO, - unsigned char *data) -{ - SJIS_TO_JIS1(HI, LO, data); - data[0] |= 0x80; - data[1] |= 0x80; - return data; -} - -unsigned char *SJIS_TO_EUC(unsigned char *src, - unsigned char *dst) -{ - register unsigned char hi, lo, *sp, *dp; - register int in_sjis = 0; - - in_sjis = IS_SJIS_STR(src); - for (sp = src, dp = dst; (hi = sp[0]) != '\0';) { - lo = sp[1]; - if (TREAT_SJIS && IS_SJIS(hi, lo, in_sjis)) { - SJIS_TO_JIS1(hi, lo, dp); - dp[0] |= 0x80; - dp[1] |= 0x80; - dp += 2; - sp += 2; - } else - *dp++ = *sp++; - } - *dp = 0; - return dst; -} - -unsigned char *EUC_TO_SJIS(unsigned char *src, - unsigned char *dst) -{ - register unsigned char *sp, *dp; - - for (sp = src, dp = dst; *sp;) { - if (*sp & 0x80) { - if (sp[1] && (sp[1] & 0x80)) { - JIS_TO_SJIS1(UCH(sp[0] & 0x7F), UCH(sp[1] & 0x7F), dp); - dp += 2; - sp += 2; - } else { - sp++; - } - } else { - *dp++ = *sp++; - } - } - *dp = 0; - return dst; -} - -#define Strcpy(a,b) (strcpy((char*)a,(const char*)b),&a[strlen((const char*)a)]) - -unsigned char *EUC_TO_JIS(unsigned char *src, - unsigned char *dst, - const char *toK, - const char *toA) -{ - register unsigned char kana_mode = 0; - register unsigned char cch; - register unsigned char *sp = src; - register unsigned char *dp = dst; - int is_JIS = 0; - - while ((cch = *sp++) != '\0') { - if (cch & 0x80) { - if (!IS_EUC(cch, *sp)) { - if (cch == 0xA0 && is_JIS) /* ignore NBSP */ - continue; - is_JIS++; - *dp++ = cch; - continue; - } - if (!kana_mode) { - kana_mode = UCH(~kana_mode); - dp = Strcpy(dp, toK); - } - if (*sp & 0x80) { - *dp++ = UCH(cch & ~0x80); - *dp++ = UCH(*sp++ & ~0x80); - } - } else { - if (kana_mode) { - kana_mode = UCH(~kana_mode); - dp = Strcpy(dp, toA); - } - *dp++ = cch; - } - } - if (kana_mode) - dp = Strcpy(dp, toA); - - if (dp) - *dp = 0; - return dst; -} - -#define IS_JIS7(c1,c2) (0x20<(c1)&&(c1)<0x7F && 0x20<(c2)&&(c2)<0x7F) -#define SO ('N'-0x40) -#define SI ('O'-0x40) - -static int repair_JIS = 0; - -static const unsigned char *repairJIStoEUC(const unsigned char *src, - unsigned char **dstp) -{ - const unsigned char *s; - unsigned char *d, ch1, ch2; - - d = *dstp; - s = src; - while ((ch1 = s[0]) && (ch2 = s[1])) { - s += 2; - if (ch1 == '(') - if (ch2 == 'B' || ch2 == 'J') { - *dstp = d; - return s; - } - if (!IS_JIS7(ch1, ch2)) - return 0; - - *d++ = UCH(0x80 | ch1); - *d++ = UCH(0x80 | ch2); - } - return 0; -} - -unsigned char *TO_EUC(const unsigned char *jis, - unsigned char *euc) -{ - register const unsigned char *s; - register unsigned char c, jis_stat; - unsigned char *d; - register int to1B, to2B; - register int in_sjis = 0; - static int nje; - int n8bits; - int is_JIS; - - nje++; - n8bits = 0; - s = jis; - d = euc; - jis_stat = 0; - to2B = TO_2BCODE; - to1B = TO_1BCODE; - in_sjis = IS_SJIS_STR(jis); - is_JIS = 0; - - while ((c = *s++) != '\0') { - if (c == 0x80) - continue; /* ignore it */ - if (c == 0xA0 && is_JIS) - continue; /* ignore Non-breaking space */ - - if (c == to2B && jis_stat == 0 && repair_JIS) { - if (*s == 'B' || *s == '@') { - const unsigned char *ts; - - if ((ts = repairJIStoEUC(s + 1, &d)) != NULL) { - s = ts; - continue; - } - } - } - if (c == CH_ESC) { - if (*s == to2B) { - if ((s[1] == 'B') || (s[1] == '@')) { - jis_stat = 0x80; - s += 2; - is_JIS++; - continue; - } - jis_stat = 0; - } else if (*s == to1B) { - jis_stat = 0; - if ((s[1] == 'B') || (s[1] == 'J') || (s[1] == 'H')) { - s += 2; - continue; - } - } else if (*s == ',') { /* MULE */ - jis_stat = 0; - } - } - if (c & 0x80) - n8bits++; - - if (IS_SJIS(c, *s, in_sjis)) { - SJIS_TO_EUC1(c, *s, d); - d += 2; - s++; - is_JIS++; - } else if (jis_stat) { - if (c <= 0x20 || 0x7F <= c) { - *d++ = c; - if (c == '\n') - jis_stat = 0; - } else { - if (IS_JIS7(c, *s)) { - *d++ = jis_stat | c; - *d++ = jis_stat | *s++; - } else - *d++ = c; - } - } else { - if (n8bits == 0 && (c == SI || c == SO)) { - } else { - *d++ = c; - } - } - } - *d = 0; - return euc; -} - -#define non94(ch) ((ch) <= 0x20 || (ch) == 0x7F) - -static int is_EUC_JP(unsigned char *euc) -{ - unsigned char *cp; - int ch1, ch2; - - for (cp = euc; (ch1 = *cp) != '\0'; cp++) { - if (ch1 & 0x80) { - ch2 = cp[1] & 0xFF; - if ((ch2 & 0x80) == 0) { - /* sv1log("NOT_EUC1[%x][%x]\n",ch1,ch2); */ - return 0; - } - if (non94(ch1 & 0x7F) || non94(ch2 & 0x7F)) { - /* sv1log("NOT_EUC2[%x][%x]\n",ch1,ch2); */ - return 0; - } - cp++; - } - } - return 1; -} - -void TO_SJIS(const unsigned char *arg, - unsigned char *sjis) -{ - unsigned char *euc; - - euc = typeMallocn(unsigned char, strlen((const char *) arg) + 1); - -#ifdef CJK_EX - if (!euc) - outofmem(__FILE__, "TO_SJIS"); -#endif - TO_EUC(arg, euc); - if (is_EUC_JP(euc)) - EUC_TO_SJIS(euc, sjis); - else - strcpy((char *) sjis, (const char *) arg); - free(euc); -} - -void TO_JIS(const unsigned char *arg, - unsigned char *jis) -{ - unsigned char *euc; - - if (arg[0] == 0) { - jis[0] = 0; - return; - } - euc = typeMallocn(unsigned char, strlen((const char *)arg) + 1); -#ifdef CJK_EX - if (!euc) - outofmem(__FILE__, "TO_JIS"); -#endif - TO_EUC(arg, euc); - is_EUC_JP(euc); - EUC_TO_JIS(euc, jis, TO_KANJI, TO_ASCII); - - free(euc); -} |