diff options
Diffstat (limited to 'WWW/Library/Implementation/SGML.c')
-rw-r--r-- | WWW/Library/Implementation/SGML.c | 235 |
1 files changed, 164 insertions, 71 deletions
diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c index 51434a66..2a210561 100644 --- a/WWW/Library/Implementation/SGML.c +++ b/WWW/Library/Implementation/SGML.c @@ -44,8 +44,6 @@ PUBLIC BOOL HTPassEightBitNum = FALSE; /* Pass ^ numeric entities raw. */ PUBLIC BOOL HTPassHighCtrlRaw = FALSE; /* Pass 127-160,173, raw. */ PUBLIC BOOL HTPassHighCtrlNum = FALSE; /* Pass €-Ÿ raw. */ -extern BOOLEAN LYCheckForCSI PARAMS((HTParentAnchor *anchor, char **url)); -extern void LYDoCSI PARAMS((char *url, CONST char *comment, char **csi)); /* The State (context) of the parser ** @@ -118,8 +116,8 @@ struct _HTStream { LYUCcharset * htmlUCI; /* anchor UCInfo for target */ int html_char_set; /* feed it to target stream */ char utf_count; - long utf_char; - char utf_buf[7]; + UCode_t utf_char; + char utf_buf[8]; char * utf_buf_p; UCTransParams T; int current_tag_charset; /* charset to pass attributes */ @@ -137,9 +135,9 @@ struct _HTStream { #ifdef EXP_CHARTRANS PRIVATE void set_chartrans_handling ARGS3( - HTStream *, context, - HTParentAnchor *, anchor, - int, chndl) + HTStream *, context, + HTParentAnchor *, anchor, + int, chndl) { extern int current_char_set; @@ -150,9 +148,9 @@ PRIVATE void set_chartrans_handling ARGS3( if (chndl < 0) chndl = current_char_set; HTAnchor_setUCInfoStage(anchor, chndl, UCT_STAGE_HTEXT, - UCT_SETBY_DEFAULT); + UCT_SETBY_DEFAULT); HTAnchor_setUCInfoStage(anchor, chndl, UCT_STAGE_STRUCTURED, - UCT_SETBY_DEFAULT); + UCT_SETBY_DEFAULT); context->htmlUCI = HTAnchor_getUCInfoStage(anchor, UCT_STAGE_STRUCTURED); context->html_char_set = HTAnchor_getUCLYhndl(context->node_anchor, @@ -178,7 +176,7 @@ PRIVATE void set_chartrans_handling ARGS3( } PRIVATE void change_chartrans_handling ARGS1( - HTStream *, context) + HTStream *, context) { int new_LYhndl = HTAnchor_getUCLYhndl(context->node_anchor, UCT_STAGE_PARSER); @@ -301,22 +299,28 @@ PRIVATE void handle_attribute_value ARGS2( /* translate some Unicodes to Lynx special codes and output them. */ PRIVATE BOOL put_special_unicodes ARGS2( HTStream *, context, - long, code) + UCode_t, code) { if (code == 160) { + /* + ** Use Lynx special character for nbsp. + */ PUTC(HT_NON_BREAK_SPACE); - } else if (code==173) { + } else if (code == 173) { + /* + ** Use Lynx special character for shy. + */ PUTC(LY_SOFT_HYPHEN); } else if (code == 8194 || code == 8195 || code == 8201) { - /* - ** ensp, emsp or thinsp. - */ + /* + ** Use Lynx special character for ensp, emsp or thinsp. + */ PUTC(HT_EM_SPACE); } else if (code == 8211 || code == 8212) { - /* - ** ndash or mdash. - */ - PUTC('-'); + /* + ** Use ASCII hyphen for ndash/endash or mdash/emdash. + */ + PUTC('-'); } else { /* ** Return NO if nothing done. @@ -342,11 +346,11 @@ PRIVATE BOOL put_special_unicodes ARGS2( ** Modified SGML_character() so we only come here with terminator ** as '\0' and check a FoundEntity flag. -- Foteos Macrides ** -** Modified more (for use with CHARTRANS): +** Modified more (for use with Lynx character translation code): */ #ifdef EXP_CHARTRANS -PRIVATE char replace_buf [61]; /* buffer for replacement strings */ +PRIVATE char replace_buf [64]; /* buffer for replacement strings */ #endif PRIVATE BOOL FoundEntity = FALSE; @@ -356,11 +360,9 @@ PRIVATE void handle_entity ARGS2( char, term) { CONST char ** entities = context->dtd->entity_names; -#ifdef EXP_CHARTRANS CONST UC_entity_info * extra_entities = context->dtd->extra_entity_info; extern int current_char_set; int rc; -#endif CONST char *s = context->string->data; int high, low, i, diff; @@ -451,7 +453,7 @@ PRIVATE void handle_entity ARGS2( ** If entity string not found, display as text. */ if (TRACE) - fprintf(stderr, "SGML: Unknown entity %s\n", s); + fprintf(stderr, "SGML: Unknown entity '%s'\n", s); PUTC('&'); { CONST char *p; @@ -477,7 +479,7 @@ PRIVATE void handle_comment ARGS1( if (context->csi == NULL && strncmp(s, "!--#", 4) == 0 && - LYCheckForCSI(context->node_anchor, (char **)&context->url) == TRUE) { + LYCheckForCSI(context->target, (char **)&context->url) == TRUE) { LYDoCSI(context->url, s, (char **)&context->csi); } @@ -871,7 +873,7 @@ PUBLIC HTTag * SGMLFindTag ARGS2( int high, low, i, diff; for (low = 0, high=dtd->number_of_tags; high > low; - diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */ + diff < 0 ? (low = i+1) : (high = i)) { /* Binary search */ i = (low + (high-low)/2); diff = strcasecomp(dtd->tags[i].name, string); /* Case insensitive */ if (diff == 0) { /* success: found it */ @@ -1005,20 +1007,19 @@ PUBLIC void SGML_character ARGS2( HTChunk *string = context->string; CONST char * EntityName; extern int current_char_set; - extern CONST char * LYchar_set_names[]; extern CONST char * HTMLGetEntityName PARAMS((int i)); #ifdef EXP_CHARTRANS extern int LYlowest_eightbit[]; char * p; BOOLEAN chk; /* Helps (?) walk through all the else ifs... */ - long clong, uck; /* Enough bits for UCS4 ... */ + UCode_t clong, uck; /* Enough bits for UCS4 ... */ char c; char saved_char_in = '\0'; /* - ** Now some fun with the preprocessor... - ** use copies for c an unsign_c == clong, so that we - ** can revert back to the unchanged c_in. + ** Now some fun with the preprocessor. + ** Use copies for c and unsign_c == clong, so that + ** we can revert back to the unchanged c_in. - KW */ #define unsign_c clong @@ -1035,9 +1036,9 @@ PUBLIC void SGML_character ARGS2( /* ** Combine UTF-8 into Unicode. ** Incomplete characters silently ignored. - ** From Linux kernel's console.c. + ** From Linux kernel's console.c. - KW */ - if((unsigned char)c > 0x7f) { + if ((unsigned char)c > 127) { if (context->utf_count > 0 && (c & 0xc0) == 0x80) { context->utf_char = (context->utf_char << 6) | (c & 0x3f); context->utf_count--; @@ -1051,7 +1052,7 @@ PUBLIC void SGML_character ARGS2( goto top1; } else { /* - ** Wait for more. + ** Wait for more. - KW */ return; } @@ -1075,20 +1076,20 @@ PUBLIC void SGML_character ARGS2( context->utf_char = (c & 0x01); } else { /* - ** Garbage. + ** Garbage. - KW */ context->utf_count = 0; context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; } /* - ** Wait for more. + ** Wait for more. - KW */ return; } } else { /* - ** Got an ASCII char. + ** Got an ASCII char. - KW */ context->utf_count = 0; context->utf_buf_p = context->utf_buf; @@ -1101,9 +1102,9 @@ PUBLIC void SGML_character ARGS2( saved_char_in = c; if (context->T.trans_to_uni && - (unsign_c >= 127 || - (unsign_c < 32 && unsign_c != 0 && - context->T.trans_C0_to_uni))) { + ((unsign_c >= 127) || + (unsign_c < 32 && unsign_c != 0 && + context->T.trans_C0_to_uni))) { clong = UCTransToUni(c, context->in_char_set); if (clong > 0) { saved_char_in = c; @@ -1233,7 +1234,7 @@ top1: } if (c == '&' && unsign_c < 127 && (!context->element_stack || - (context->element_stack->tag && + (context->element_stack->tag && (context->element_stack->tag->contents == SGML_MIXED || context->element_stack->tag->contents == SGML_PCDATA || context->element_stack->tag->contents == SGML_RCDATA)))) { @@ -1276,7 +1277,7 @@ top1: } else if (context->T.use_raw_char_in && saved_char_in) { /* ** Only if the original character is still in saved_char_in, - ** otherwise we may be iterating from a goto top + ** otherwise we may be iterating from a goto top. - KW */ PUTC(saved_char_in); saved_char_in = '\0'; @@ -1305,10 +1306,15 @@ top1: 0) >= 0)) { /* ** No further tests for valididy - assume that whoever - ** defined replacement strings knew what she was doing. + ** defined replacement strings knew what she was doing. - KW */ - for (p=replace_buf; *p; p++) + for (p = replace_buf; *p; p++) PUTC(*p); + /* + ** If we're displaying UTF-8, try that now. - FM + */ + } else if (context->T.output_utf8 && PUTUTF8(clong)) { + ; /* do nothing more */ #endif /* EXP_CHARTRANS */ /* @@ -1321,11 +1327,14 @@ top1: #define PASSHI8BIT HTPassEightBitRaw #else #define PASSHI8BIT (HTPassEightBitRaw || (context->T.do_8bitraw && !context->T.trans_from_uni)) +#define IncludesLatin1Enc(cs) \ + (cs == 0 || \ + (context->htmlUCI && \ + (context->htmlUCI->enc & (UCT_CP_SUPERSETOF_LAT1)))) #endif /* EXP_CHARTRANS */ } else if (unsign_c > 160 && unsign_c < 256 && !(PASSHI8BIT || HTCJK != NOCJK) && - strncmp(LYchar_set_names[current_char_set], - "ISO Latin 1", 11)) { + !IncludesLatin1Enc(current_char_set)) { int i; int value; @@ -1347,10 +1356,10 @@ top1: PUTC(c); /* ** If we get to here, and should have translated, - ** translation has failed so far. + ** translation has failed so far. - KW */ } else if (context->T.output_utf8 && *context->utf_buf) { - for (p=context->utf_buf; *p; p++) + for (p = context->utf_buf; *p; p++) PUTC(*p); context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; @@ -1360,15 +1369,15 @@ top1: ((unsigned char)saved_char_in < 255)) { /* ** KOI8 special: strip high bit, gives (somewhat) readable - ** ASCII or KOI7 - it was constructed that way! + ** ASCII or KOI7 - it was constructed that way! - KW */ PUTC((char)(saved_char_in & 0x7f)); saved_char_in = '\0'; } else if ((unsigned char)c < LYlowest_eightbit[context->html_char_set] || (context->T.trans_from_uni && !HTPassEightBitRaw)) { - sprintf(replace_buf,"U%.2lx",unsign_c); - for (p=replace_buf; *p; p++) + sprintf(replace_buf, "U%.2lX", unsign_c); + for (p = replace_buf; *p; p++) PUTC(*p); #endif /* EXP_CHARTRANS */ /* @@ -1383,7 +1392,7 @@ top1: ** In litteral mode, waits only for specific end tag (for ** compatibility with old servers, and for Lynx). - FM */ - case S_litteral : + case S_litteral: HTChunkPutc(string, c); if (TOUPPER(c) != ((string->size == 1) ? '/' : @@ -1448,7 +1457,36 @@ top1: ** Terminate entity name and try to handle it. - FM */ HTChunkTerminate(string); - handle_entity(context, '\0'); + if (!strcmp(string->data, "zwnj") && + (!context->element_stack || + (context->element_stack->tag && + context->element_stack->tag->contents == SGML_MIXED))) { + /* + ** Handle zwnj (8204) as <WBR>. - FM + */ + char temp[8]; + + if (TRACE) { + fprintf(stderr, + "SGML_character: Handling 'zwnj' entity as 'WBR' element.\n"); + } + if (c != ';') { + sprintf(temp, "<WBR>%c", c); + } else { + sprintf(temp, "<WBR>"); + } + if (context->recover == NULL) { + StrAllocCopy(context->recover, temp); + context->recover_index = 0; + } else { + StrAllocCat(context->recover, temp); + } + string->size = 0; + context->state = S_text; + break; + } else { + handle_entity(context, '\0'); + } string->size = 0; context->state = S_text; /* @@ -1518,6 +1556,46 @@ top1: HTChunkTerminate(string); if ((context->isHex ? sscanf(string->data, "%x", &value) : sscanf(string->data, "%d", &value)) == 1) { + /* + ** Check for special values. - FM + */ + if ((value == 8204) && + (!context->element_stack || + (context->element_stack->tag && + context->element_stack->tag->contents == SGML_MIXED))) { + /* + ** Handle zwnj (8204) as <WBR>. - FM + */ + char temp[8]; + + if (TRACE) { + fprintf(stderr, + "SGML_character: Handling '8204' (zwnj) reference as 'WBR' element.\n"); + } + /* + ** Include the terminator if it is not + ** the standard semi-colon. - FM + */ + if (c != ';') { + sprintf(temp, "<WBR>%c", c); + } else { + sprintf(temp, "<WBR>"); + } + /* + ** Add the replacement string to the + ** recover buffer for processing. - FM + */ + if (context->recover == NULL) { + StrAllocCopy(context->recover, temp); + context->recover_index = 0; + } else { + StrAllocCat(context->recover, temp); + } + string->size = 0; + context->isHex = FALSE; + context->state = S_text; + break; + } #ifdef EXP_CHARTRANS if (value == 160 || value == 173) { /* @@ -1546,13 +1624,13 @@ top1: uck >= LYlowest_eightbit[context->html_char_set])) { if (uck == 160 && current_char_set == 0) { /* - ** Would only happen if some other unicode + ** Would only happen if some other Unicode ** is mapped to Latin-1 160. */ PUTC(HT_NON_BREAK_SPACE); } else if (uck == 173 && current_char_set == 0) { /* - ** Would only happen if some other unicode + ** Would only happen if some other Unicode ** is mapped to Latin-1 173. */ PUTC(LY_SOFT_HYPHEN); @@ -1593,13 +1671,13 @@ top1: ** and the value: ** (1) Is greater than 255 (but use ASCII characters ** for spaces or dashes). - ** (2) Is less than 32, and not valid or we don't - ** have HTCJK set. - ** (3) Is 127 and we don't have HTPassHighCtrlRaw or - ** HTCJK set. - ** (4) Is 128 - 159 and we don't have HTPassHighCtrlNum - ** set. - ** - FM + ** (2) Is less than 32, and not valid or we don't + ** have HTCJK set. + ** (3) Is 127 and we don't have HTPassHighCtrlRaw or + ** HTCJK set. + ** (4) Is 128 - 159 and we don't have HTPassHighCtrlNum + ** set. + ** - FM */ } else if ((value > 255) || (value < 32 && @@ -1640,8 +1718,7 @@ top1: goto top1; } } else if (value < 161 || HTPassEightBitNum || - !strncmp(LYchar_set_names[current_char_set], - "ISO Latin 1", 11)) { + IncludesLatin1Enc(current_char_set)) { /* ** No conversion needed. - FM */ @@ -1696,6 +1773,7 @@ top1: ** the "standard" semi-colon for HTML. - FM */ if (c != ';') + goto top1; } else { /* @@ -1814,7 +1892,7 @@ top1: */ { int i; - for (i=0; i< context->current_tag->number_of_attributes; i++) + for (i = 0; i < context->current_tag->number_of_attributes; i++) context->present[i] = NO; } string->size = 0; @@ -2188,13 +2266,18 @@ top1: else context->state = S_tag_gap; #ifdef EXP_CHARTRANS } else if (context->T.decode_utf8 && - *context->utf_buf) { + *context->utf_buf) { HTChunkPuts(string, context->utf_buf); context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; } else if (HTCJK == NOCJK && (context->T.output_utf8 || context->T.trans_from_uni)) { - HTChunkPutUtf8Char(string, clong); + if (clong == 0xfffd && saved_char_in && HTPassEightBitRaw && + (unsigned char)saved_char_in >= LYlowest_eightbit[current_char_set]) + HTChunkPutUtf8Char(string, + (0xf000 | (unsigned char)saved_char_in)); + else + HTChunkPutUtf8Char(string, clong); } else if (saved_char_in && context->T.use_raw_char_in) { HTChunkPutc(string, saved_char_in); #endif /* EXP_CHARTRANS */ @@ -2218,13 +2301,18 @@ top1: HTChunkPutc(string, c); #ifdef EXP_CHARTRANS } else if (context->T.decode_utf8 && - *context->utf_buf) { + *context->utf_buf) { HTChunkPuts(string, context->utf_buf); context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; } else if (HTCJK == NOCJK && (context->T.output_utf8 || context->T.trans_from_uni)) { - HTChunkPutUtf8Char(string, clong); + if (clong == 0xfffd && saved_char_in && HTPassEightBitRaw && + (unsigned char)saved_char_in >= LYlowest_eightbit[current_char_set]) + HTChunkPutUtf8Char(string, + (0xf000 | (unsigned char)saved_char_in)); + else + HTChunkPutUtf8Char(string, clong); } else if (saved_char_in && context->T.use_raw_char_in) { HTChunkPutc(string, saved_char_in); #endif /* EXP_CHARTRANS */ @@ -2252,13 +2340,18 @@ top1: HTChunkPutc(string, c); #ifdef EXP_CHARTRANS } else if (context->T.decode_utf8 && - *context->utf_buf) { + *context->utf_buf) { HTChunkPuts(string, context->utf_buf); context->utf_buf_p = context->utf_buf; *(context->utf_buf_p) = '\0'; } else if (HTCJK == NOCJK && (context->T.output_utf8 || context->T.trans_from_uni)) { - HTChunkPutUtf8Char(string, clong); + if (clong == 0xfffd && saved_char_in && HTPassEightBitRaw && + (unsigned char)saved_char_in >= LYlowest_eightbit[current_char_set]) + HTChunkPutUtf8Char(string, + (0xf000 | (unsigned char)saved_char_in)); + else + HTChunkPutUtf8Char(string, clong); } else if (saved_char_in && context->T.use_raw_char_in) { HTChunkPutc(string, saved_char_in); #endif /* EXP_CHARTRANS */ |