diff options
Diffstat (limited to 'WWW/Library/Implementation/HTPlain.c')
-rw-r--r-- | WWW/Library/Implementation/HTPlain.c | 722 |
1 files changed, 0 insertions, 722 deletions
diff --git a/WWW/Library/Implementation/HTPlain.c b/WWW/Library/Implementation/HTPlain.c deleted file mode 100644 index b3e80c6f..00000000 --- a/WWW/Library/Implementation/HTPlain.c +++ /dev/null @@ -1,722 +0,0 @@ -/* - * $LynxId: HTPlain.c,v 1.49 2011/06/11 12:09:07 tom Exp $ - * - * Plain text object HTWrite.c - * ================= - * - * This version of the stream object just writes to a socket. - * The socket is assumed open and left open. - * - * Bugs: - * strings written must be less than buffer size. - */ - -#define HTSTREAM_INTERNAL 1 - -#include <HTUtils.h> -#include <LYCharVals.h> /* S/390 -- gil -- 0288 */ - -#include <HTPlain.h> - -#include <HTChunk.h> -#include <HText.h> -#include <HTStyle.h> -#define Lynx_HTML_Handler -#include <HTML.h> /* styles[] */ - -#define BUFFER_SIZE 4096; /* Tradeoff */ - -#include <HTMLDTD.h> -#include <HTCJK.h> -#include <UCMap.h> -#include <UCDefs.h> -#include <UCAux.h> - -#include <LYCharSets.h> -#include <LYStrings.h> -#include <LYLeaks.h> - -static int HTPlain_lastraw = -1; -static int HTPlain_bs_pending = 0; /* 1:bs 2:underline 3:underline+bs - kw */ - -/* HTML Object - * ----------- - */ -struct _HTStream { - const HTStreamClass *isa; - HText *text; - /* - * The node_anchor UCInfo and handle for the input (PARSER) stage. - FM - */ - LYUCcharset *inUCI; - int inUCLYhndl; - /* - * The node_anchor UCInfo and handle for the output (HTEXT) stage. - FM - */ - LYUCcharset *outUCI; - int outUCLYhndl; - /* - * Counter, value, buffer and pointer for UTF-8 handling. - FM - */ - char utf_count; - UCode_t utf_char; - char utf_buf[8]; - char *utf_buf_p; - /* - * The charset transformation structure. - FM - */ - UCTransParams T; -}; - -static char replace_buf[64]; /* buffer for replacement strings */ - -static void HTPlain_getChartransInfo(HTStream *me, HTParentAnchor *anchor) -{ - if (me->inUCLYhndl < 0) { - HTAnchor_copyUCInfoStage(anchor, UCT_STAGE_PARSER, UCT_STAGE_MIME, - UCT_SETBY_PARSER); - me->inUCLYhndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_PARSER); - } - if (me->outUCLYhndl < 0) { - int chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT); - - if (chndl < 0) { - chndl = current_char_set; - HTAnchor_setUCInfoStage(anchor, chndl, - UCT_STAGE_HTEXT, UCT_SETBY_DEFAULT); - } - HTAnchor_setUCInfoStage(anchor, chndl, - UCT_STAGE_HTEXT, UCT_SETBY_DEFAULT); - me->outUCLYhndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT); - } - me->inUCI = HTAnchor_getUCInfoStage(anchor, UCT_STAGE_PARSER); - me->outUCI = HTAnchor_getUCInfoStage(anchor, UCT_STAGE_HTEXT); -} - -/* Write the buffer out to the socket - * ---------------------------------- - */ - -/*_________________________________________________________________________ - * - * A C T I O N R O U T I N E S - */ - -static void HTPlain_write(HTStream *me, const char *s, - int l); - -/* Character handling - * ------------------ - */ -static void HTPlain_put_character(HTStream *me, int c) -{ -#ifdef REMOVE_CR_ONLY - /* - * Throw away \r's. - */ - if (c != '\r') { - HText_appendCharacter(me->text, c); - } -#else - /* - * See HTPlain_write() for explanations of the following code (we've been - * called via HTPlain_put_string() to do for each character of a terminated - * string what HTPlain_write() does via a while loop for each character in - * a stream of given length). - FM - */ - if ((HTPlain_lastraw == '\r') && c == '\n') { - HTPlain_lastraw = -1; - return; - } - if (c == '\b' || c == '_' || HTPlain_bs_pending) { - char temp[1]; - - temp[0] = (char) c; - HTPlain_write(me, temp, 1); - return; - } - HTPlain_lastraw = UCH(c); - if (c == '\r') { - HText_appendCharacter(me->text, '\n'); - } else if (TOASCII(UCH(c)) >= 127) { /* S/390 -- gil -- 0305 */ - char temp[1]; - - temp[0] = (char) c; - /* - * For now, don't repeat everything here that has been done below - KW - */ - HTPlain_write(me, temp, 1); - } else if (IS_CJK_TTY) { - HText_appendCharacter(me->text, c); - } else if (TOASCII(UCH(c)) >= 127 && TOASCII(UCH(c)) < 161 && - HTPassHighCtrlRaw) { - HText_appendCharacter(me->text, c); - } else if (UCH(c) == CH_NBSP) { /* S/390 -- gil -- 0341 */ - HText_appendCharacter(me->text, ' '); - } else if (UCH(c) == CH_SHY) { - return; - } else if ((UCH(c) >= ' ' && TOASCII(UCH(c)) < 127) || - c == '\n' || c == '\t') { - HText_appendCharacter(me->text, c); - } else if (TOASCII(UCH(c)) > 160) { - if (!HTPassEightBitRaw && - !((me->outUCLYhndl == LATIN1) || - (me->outUCI->enc & (UCT_CP_SUPERSETOF_LAT1)))) { - int len, high, low, i, diff = 1; - const char *name; - UCode_t value = (UCode_t) FROMASCII((TOASCII(UCH(c)) - 160)); - - name = HTMLGetEntityName(value); - len = (int) strlen(name); - for (low = 0, high = (int) HTML_dtd.number_of_entities; - high > low; - diff < 0 ? (low = i + 1) : (high = i)) { - /* Binary search */ - i = (low + (high - low) / 2); - diff = AS_ncmp(HTML_dtd.entity_names[i], name, (unsigned) len); - if (diff == 0) { - HText_appendText(me->text, - LYCharSets[me->outUCLYhndl][i]); - break; - } - } - if (diff) { - HText_appendCharacter(me->text, c); - } - } else { - HText_appendCharacter(me->text, c); - } - } -#endif /* REMOVE_CR_ONLY */ -} - -/* String handling - * --------------- - * - */ -static void HTPlain_put_string(HTStream *me, const char *s) -{ -#ifdef REMOVE_CR_ONLY - HText_appendText(me->text, s); -#else - const char *p; - - if (s == NULL) - return; - for (p = s; *p; p++) { - HTPlain_put_character(me, *p); - } -#endif /* REMOVE_CR_ONLY */ -} - -/* - * Entry function for displayed text/plain and WWW_SOURCE strings. - FM - * --------------------------------------------------------------- - */ -static void HTPlain_write(HTStream *me, const char *s, int l) -{ - const char *p; - const char *e = s + l; - char c; - unsigned c_unsign; - BOOL chk; - UCode_t code, uck = -1; - char saved_char_in = '\0'; - - for (p = s; p < e; p++) { -#ifdef REMOVE_CR_ONLY - /* - * Append the whole string, but remove any \r's. - FM - */ - if (*p != '\r') { - HText_appendCharacter(me->text, *p); - } -#else - if (*p == '\b') { - if (HTPlain_lastraw >= UCH(' ') && - HTPlain_lastraw != '\r' && HTPlain_lastraw != '\n') { - if (!HTPlain_bs_pending) { - HTPlain_bs_pending = 1; - continue; - } else if (HTPlain_bs_pending == 2) { - HTPlain_bs_pending = 3; - continue; - } - } - if (HTPlain_bs_pending >= 2) - HText_appendCharacter(me->text, '_'); - HTPlain_bs_pending = 0; - } else if (*p == '_') { - if (!HTPlain_bs_pending) { - HTPlain_bs_pending = 2; - HTPlain_lastraw = UCH(*p); - continue; -#if 0 - } else if (HTPlain_bs_pending != 2) { - HTPlain_bs_pending--; /* 1 -> 0, 3 -> 2 */ - HTPlain_lastraw = UCH(*p); - continue; -#endif - } - } - - /* - * Try to handle lone LFs, CRLFs and lone CRs as newline, and to deal - * with control, ASCII, and 8-bit characters based on best guesses of - * what's appropriate. - FM - */ - if ((HTPlain_lastraw == '\r') && *p == '\n') { - HTPlain_lastraw = -1; - continue; - } - - if (HTPlain_bs_pending && - !(UCH(*p) >= ' ' && *p != '\r' && *p != '\n' && - (HTPlain_lastraw == UCH(*p) || - HTPlain_lastraw == UCH('_') || - *p == '_'))) { - if (HTPlain_bs_pending >= 2) - HText_appendCharacter(me->text, '_'); - HTPlain_bs_pending = 0; - } else if (HTPlain_bs_pending == 1) { - HTPlain_bs_pending = 0; - continue; /* ignore last two of "X\bX" or "X\b_" - kw */ - } else if (HTPlain_bs_pending == 3) { - if (*p == '_') { - HTPlain_bs_pending = 2; - continue; /* ignore last two of "_\b_" - kw */ - } else { - HTPlain_bs_pending = 0; - /* ignore first two of "_\bX" - kw */ - } - } else if (HTPlain_bs_pending == 2) { - HText_appendCharacter(me->text, '_'); - if (*p == '_') - continue; /* keep second of "__" pending - kw */ - HTPlain_bs_pending = 0; - } else { - HTPlain_bs_pending = 0; - } - HTPlain_lastraw = UCH(*p); - if (*p == '\r') { - HText_appendCharacter(me->text, '\n'); - continue; - } - /* - * Make sure the character is handled as Unicode whenever that's - * appropriate. - FM - */ - c = *p; - c_unsign = UCH(c); - code = (UCode_t) c_unsign; - saved_char_in = '\0'; - /* - * Combine any UTF-8 multibytes into Unicode to check for special - * characters. - FM - */ - if (me->T.decode_utf8) { - /* - * Combine UTF-8 into Unicode. Incomplete characters silently - * ignored. from Linux kernel's console.c - KW - */ - if (TOASCII(c_unsign) > 127) { /* S/390 -- gil -- 0371 */ - /* - * We have an octet from a multibyte character. - FM - */ - if (me->utf_count > 0 && (c & 0xc0) == 0x80) { - /* - * Adjust the UCode_t value, add the octet to the buffer, - * and decrement the byte count. - FM - */ - me->utf_char = (me->utf_char << 6) | (c & 0x3f); - me->utf_count--; - *(me->utf_buf_p) = c; - (me->utf_buf_p)++; - if (me->utf_count == 0) { - /* - * Got a complete multibyte character. - */ - *(me->utf_buf_p) = '\0'; - code = me->utf_char; - if (code > 0 && code < 256) { - c = FROMASCII((char) code); - c_unsign = UCH(c); - } - } else { - /* - * Get the next byte. - FM - */ - continue; - } - } else { - /* - * Start handling a new multibyte character. - FM - */ - me->utf_buf_p[0] = c; - me->utf_buf_p = &me->utf_buf[1]; - if ((*p & 0xe0) == 0xc0) { - me->utf_count = 1; - me->utf_char = (c & 0x1f); - } else if ((*p & 0xf0) == 0xe0) { - me->utf_count = 2; - me->utf_char = (c & 0x0f); - } else if ((*p & 0xf8) == 0xf0) { - me->utf_count = 3; - me->utf_char = (c & 0x07); - } else if ((*p & 0xfc) == 0xf8) { - me->utf_count = 4; - me->utf_char = (c & 0x03); - } else if ((*p & 0xfe) == 0xfc) { - me->utf_count = 5; - me->utf_char = (c & 0x01); - } else { - /* - * We got garbage, so ignore it. - FM - */ - me->utf_count = 0; - me->utf_buf_p[0] = '\0'; - me->utf_buf_p = me->utf_buf; - } - /* - * Get the next byte. - FM - */ - continue; - } - } else if (me->utf_count > 0) { - /* - * Got an ASCII character when expecting UTF-8 multibytes, so - * ignore the buffered multibye characters and fall through - * with the current ASCII character. - FM - */ - me->utf_count = 0; - me->utf_buf[0] = '\0'; - me->utf_buf_p = me->utf_buf; - code = (UCode_t) c_unsign; - } else { - /* - * Got a valid ASCII character, so fall through with it. - FM - */ - code = (UCode_t) c_unsign; - } - } - /* - * Convert characters from non-UTF-8 charsets to Unicode (if - * appropriate). - FM - */ - if (!(me->T.decode_utf8 && - UCH(*p) > 127)) { -#ifdef NOTDEFINED - if (me->T.strip_raw_char_in) - saved_char_in = c; -#endif /* NOTDEFINED */ - if (me->T.trans_to_uni && - (TOASCII(code) >= LYlowest_eightbit[me->inUCLYhndl] || /* S/390 -- gil -- 0389 */ - (code < ' ' && code != 0 && - me->T.trans_C0_to_uni))) { - /* - * Convert the octet to Unicode. - FM - */ - code = (UCode_t) UCTransToUni(c, me->inUCLYhndl); - if (code > 0) { - saved_char_in = c; - if (code < 256) { - c = FROMASCII((char) code); - c_unsign = UCH(c); - } - } - } else if (code < 32 && code != 0 && - me->T.trans_C0_to_uni) { - /* - * Quote from SGML.c: - * "This else if may be too ugly to keep. - KW" - */ - if (me->T.trans_from_uni && - (((code = UCTransToUni(c, me->inUCLYhndl)) >= 32) || - (me->T.transp && - (code = UCTransToUni(c, me->inUCLYhndl)) > 0))) { - saved_char_in = c; - if (code < 256) { - c = FROMASCII((char) code); - c_unsign = UCH(c); - } - } else { - uck = -1; - if (me->T.transp) { - uck = UCTransCharStr(replace_buf, 60, c, - me->inUCLYhndl, - me->inUCLYhndl, NO); - } - if (!me->T.transp || uck < 0) { - uck = UCTransCharStr(replace_buf, 60, c, - me->inUCLYhndl, - me->outUCLYhndl, YES); - } - if (uck == 0) { - continue; - } else if (uck < 0) { - me->utf_buf[0] = '\0'; - } else { - c = replace_buf[0]; - if (c && replace_buf[1]) { - HText_appendText(me->text, replace_buf); - continue; - } - } - me->utf_buf[0] = '\0'; - code = UCH(c); - } /* Next line end of ugly stuff for C0. - KW */ - } else { - me->utf_buf[0] = '\0'; - code = UCH(c); - } - } - /* - * At this point we have either code in Unicode (and c in latin1 if - * code is in the latin1 range), or code and c will have to be passed - * raw. - */ - - /* - * If CJK mode is on, we'll assume the document matches the user's - * display character set, and if not, the user should toggle off - * raw/CJK mode to reload. - FM - */ - if (IS_CJK_TTY) { - HText_appendCharacter(me->text, c); - -#define PASSHICTRL (me->T.transp || \ - code >= LYlowest_eightbit[me->inUCLYhndl]) -#define PASS8859SPECL me->T.pass_160_173_raw -#define PASSHI8BIT (HTPassEightBitRaw || \ - (me->T.do_8bitraw && !me->T.trans_from_uni)) - /* - * If HTPassHighCtrlRaw is set (e.g., for KOI8-R) assume the - * document matches and pass 127-160 8-bit characters. If it - * doesn't match, the user should toggle raw/CJK mode off. - FM - */ - } else if (TOASCII(code) >= 127 && TOASCII(code) < 161 && /* S/390 -- gil -- 0427 */ - PASSHICTRL && PASS8859SPECL) { - HText_appendCharacter(me->text, c); - } else if (code == CH_SHY && PASS8859SPECL) { - HText_appendCharacter(me->text, c); - /* - * If neither HTPassHighCtrlRaw nor CJK is set, play it safe and - * treat 160 (nbsp) as an ASCII space (32). - FM - */ - } else if (code == CH_NBSP) { - HText_appendCharacter(me->text, ' '); - /* - * If neither HTPassHighCtrlRaw nor CJK is set, play it safe and - * ignore 173 (shy). - FM - * Now only ignore it for color style, which doesn't handle it - * anyway. Otherwise pass it on as LY_SOFT_HYPHEN and let HText - * deal with it. It should be either ignored, or displayed as a - * hyphen if it was indeed at the end of a line. Well it should. - * - kw - */ - } else if (code == CH_SHY) { -#ifndef USE_COLOR_STYLE - HText_appendCharacter(me->text, LY_SOFT_HYPHEN); -#endif - continue; - /* - * If we get to here, pass the displayable ASCII characters. - FM - */ - } else if ((code >= ' ' && TOASCII(code) < 127) || - (PASSHI8BIT && - c >= LYlowest_eightbit[me->outUCLYhndl]) || - *p == '\n' || *p == '\t') { - HText_appendCharacter(me->text, c); - /* - * Use an ASCII space (32) for ensp, emsp or thinsp. - FM - */ - } else if (code == 8194 || code == 8195 || code == 8201) { - HText_appendCharacter(me->text, ' '); - /* - * If we want the raw character, pass it now. - FM - */ - } else if (me->T.use_raw_char_in && saved_char_in) { - HText_appendCharacter(me->text, saved_char_in); -/****************************************************************** - * I. LATIN-1 OR UCS2 TO DISPLAY CHARSET - ******************************************************************/ - } else if ((chk = (BOOL) (me->T.trans_from_uni && code >= 160)) && - (uck = UCTransUniChar(code, - me->outUCLYhndl)) >= ' ' && /* S/390 -- gil -- 0464 */ - uck < 256) { - CTRACE((tfp, "UCTransUniChar returned 0x%.2" PRI_UCode_t - ":'%c'.\n", - uck, FROMASCII(UCH(uck)))); - HText_appendCharacter(me->text, ((char) (uck & 0xff))); - } else if (chk && - (uck == -4 || - (me->T.repl_translated_C0 && uck > 0 && uck < ' ')) && /* S/390 -- gil -- 0481 */ - /* - * Not found; look for replacement string. - */ - (uck = UCTransUniCharStr(replace_buf, 60, code, - me->outUCLYhndl, 0) >= 0)) { - /* - * No further tests for valididy - assume that whoever defined - * replacement strings knew what she was doing. - */ - HText_appendText(me->text, replace_buf); - /* - * If we get to here, and should have translated, translation has - * failed so far. - */ - } else if (chk && TOASCII(code) > 127 && me->T.output_utf8) { /* S/390 -- gil -- 0498 */ - /* - * We want UTF-8 output, so do it now. - FM - */ - if (*me->utf_buf) { - HText_appendText(me->text, me->utf_buf); - me->utf_buf[0] = '\0'; - me->utf_buf_p = me->utf_buf; - } else if (UCConvertUniToUtf8(code, replace_buf)) { - HText_appendText(me->text, replace_buf); - } else { - /* - * Out of luck, so use the UHHH notation (ugh). - gil - */ - /* S/390 -- gil -- 0517 */ - sprintf(replace_buf, "U%.2lX", (unsigned long) TOASCII(code)); - HText_appendText(me->text, replace_buf); - } -#ifdef NOTDEFINED - } else if (me->T.strip_raw_char_in && - UCH(*p) >= 192 && - UCH(*p) < 255) { - /* - * KOI special: strip high bit, gives (somewhat) readable ASCII. - */ - HText_appendCharacter(me->text, (char) (*p & 0x7f)); -#endif /* NOTDEFINED */ - /* - * If we don't actually want the character, make it safe and output - * that now. - FM - */ - } else if ((c_unsign > 0 && - (int) c_unsign < LYlowest_eightbit[me->outUCLYhndl]) || - (me->T.trans_from_uni && !HTPassEightBitRaw)) { - /* - * If we do not have the "7-bit approximations" as our output - * character set (in which case we did it already) seek a - * translation for that. Otherwise, or if the translation fails, - * use UHHH notation. - FM - */ - if ((chk = (BOOL) (me->outUCLYhndl != - UCGetLYhndl_byMIME("us-ascii"))) && - (uck = UCTransUniChar(code, - UCGetLYhndl_byMIME("us-ascii"))) - >= ' ' && TOASCII(uck) < 127) { /* S/390 -- gil -- 0535 */ - /* - * Got an ASCII character (yippey). - FM - */ - c = FROMASCII((char) uck); - HText_appendCharacter(me->text, c); - } else if ((chk && uck == -4) && - (uck = UCTransUniCharStr(replace_buf, - 60, code, - UCGetLYhndl_byMIME("us-ascii"), - 0) >= 0)) { - /* - * Got a repacement string (yippey). - FM - */ - HText_appendText(me->text, replace_buf); - } else if (code == 8204 || code == 8205) { - /* - * Ignore 8204 (zwnj) or 8205 (zwj), if we get to here. - FM - */ - CTRACE((tfp, "HTPlain_write: Ignoring '%" PRI_UCode_t "'.\n", code)); - } else if (code == 8206 || code == 8207) { - /* - * Ignore 8206 (lrm) or 8207 (rlm), if we get to here. - FM - */ - CTRACE((tfp, "HTPlain_write: Ignoring '%" PRI_UCode_t "'.\n", code)); - } else { - /* - * Out of luck, so use the UHHH notation (ugh). - FM - */ - /* do not print UHHH for now - sprintf(replace_buf, "U%.2lX", code); - HText_appendText(me->text, replace_buf); - */ - } - /* - * If we get to here and have a monobyte character, pass it. - FM - */ - } else if (c_unsign != 0 && c_unsign < 256) { - HText_appendCharacter(me->text, c); - } -#endif /* REMOVE_CR_ONLY */ - } -} - -/* Free an HTML object - * ------------------- - * - * Note that the SGML parsing context is freed, but the created object is - * not, as it takes on an existence of its own unless explicitly freed. - */ -static void HTPlain_free(HTStream *me) -{ - if (HTPlain_bs_pending >= 2) - HText_appendCharacter(me->text, '_'); - FREE(me); -} - -/* End writing -*/ -static void HTPlain_abort(HTStream *me, HTError e GCC_UNUSED) -{ - HTPlain_free(me); -} - -/* Structured Object Class - * ----------------------- - */ -static const HTStreamClass HTPlain = -{ - "PlainPresenter", - HTPlain_free, - HTPlain_abort, - HTPlain_put_character, HTPlain_put_string, HTPlain_write, -}; - -/* New object - * ---------- - */ -HTStream *HTPlainPresent(HTPresentation *pres GCC_UNUSED, HTParentAnchor *anchor, - HTStream *sink GCC_UNUSED) -{ - - HTStream *me = (HTStream *) malloc(sizeof(*me)); - - if (me == NULL) - outofmem(__FILE__, "HTPlain_new"); - - assert(me != NULL); - - me->isa = &HTPlain; - - HTPlain_lastraw = -1; - - me->utf_count = 0; - me->utf_char = 0; - me->utf_buf[0] = me->utf_buf[6] = me->utf_buf[7] = '\0'; - me->utf_buf_p = me->utf_buf; - me->outUCLYhndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT); - me->inUCLYhndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_PARSER); - HTPlain_getChartransInfo(me, anchor); - UCSetTransParams(&me->T, - me->inUCLYhndl, me->inUCI, - me->outUCLYhndl, - HTAnchor_getUCInfoStage(anchor, UCT_STAGE_HTEXT)); - - me->text = HText_new(anchor); - HText_setStyle(me->text, LYstyles(HTML_XMP)); - HText_beginAppend(me->text); - - return (HTStream *) me; -} |