about summary refs log tree commit diff stats
path: root/WWW/Library/Implementation/SGML.c
diff options
context:
space:
mode:
authorThomas E. Dickey <dickey@invisible-island.net>2009-03-11 00:40:00 -0400
committerThomas E. Dickey <dickey@invisible-island.net>2009-03-11 00:40:00 -0400
commit089dd372933a775165f70ae0b32713eb3aebee98 (patch)
tree1ad37766e42512ccb3f397d6217dbf49495c1285 /WWW/Library/Implementation/SGML.c
parentf71b2b3a35fa25f2be897b75870d3ab81170b857 (diff)
downloadlynx-snapshots-089dd372933a775165f70ae0b32713eb3aebee98.tar.gz
snapshot of project "lynx", label v2-8-7dev_13b
Diffstat (limited to 'WWW/Library/Implementation/SGML.c')
-rw-r--r--WWW/Library/Implementation/SGML.c69
1 files changed, 59 insertions, 10 deletions
diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c
index 709586e5..5fd3a09e 100644
--- a/WWW/Library/Implementation/SGML.c
+++ b/WWW/Library/Implementation/SGML.c
@@ -1,5 +1,5 @@
 /*
- * $LynxId: SGML.c,v 1.120 2009/01/03 01:12:28 tom Exp $
+ * $LynxId: SGML.c,v 1.122 2009/03/10 21:16:57 tom Exp $
  *
  *			General SGML Parser code		SGML.c
  *			========================
@@ -4382,12 +4382,23 @@ static void SGML_character(HTStream *context, char c_in)
     }
 }				/* SGML_character */
 
-static void SGML_string(HTStream *context, const char *str)
+static void InferUtfFromBom(HTStream *context, int chndl)
 {
-    const char *p;
+    HTAnchor_setUCInfoStage(context->node_anchor, chndl,
+			    UCT_STAGE_PARSER,
+			    UCT_SETBY_PARSER);
+    change_chartrans_handling(context);
+}
 
-    for (p = str; *p; p++)
-	SGML_character(context, *p);
+/*
+ * Avoid rewrite of SGML_character() to handle hypothetical case of UTF-16
+ * webpages, by pretending that the data is UTF-8.
+ */
+static void SGML_widechar(HTStream *context, long ch)
+{
+    if (!UCPutUtf8_charstring(context, SGML_character, ch)) {
+	SGML_character(context, UCH(ch));
+    }
 }
 
 static void SGML_write(HTStream *context, const char *str, int l)
@@ -4395,8 +4406,45 @@ static void SGML_write(HTStream *context, const char *str, int l)
     const char *p;
     const char *e = str + l;
 
-    for (p = str; p < e; p++)
-	SGML_character(context, *p);
+    if (sgml_offset == 0) {
+	if (l > 3
+	    && !memcmp(str, "\357\273\277", 3)) {
+	    CTRACE((tfp, "SGML_write found UTF-8 BOM\n"));
+	    InferUtfFromBom(context, UTF8_handle);
+	    str += 3;
+	} else if (l > 2) {
+	    if (!memcmp(str, "\377\376", 2)) {
+		CTRACE((tfp, "SGML_write found UCS-2 LE BOM\n"));
+		InferUtfFromBom(context, UTF8_handle);
+		str += 2;
+		context->T.ucs_mode = -1;
+	    } else if (!memcmp(str, "\376\377", 2)) {
+		CTRACE((tfp, "SGML_write found UCS-2 BE BOM\n"));
+		InferUtfFromBom(context, UTF8_handle);
+		str += 2;
+		context->T.ucs_mode = 1;
+	    }
+	}
+    }
+    switch (context->T.ucs_mode) {
+    case -1:
+	for (p = str; p < e; p += 2)
+	    SGML_widechar(context, (UCH(p[1]) << 8) | UCH(p[0]));
+	break;
+    case 1:
+	for (p = str; p < e; p += 2)
+	    SGML_widechar(context, (UCH(p[0]) << 8) | UCH(p[1]));
+	break;
+    default:
+	for (p = str; p < e; p++)
+	    SGML_character(context, *p);
+	break;
+    }
+}
+
+static void SGML_string(HTStream *context, const char *str)
+{
+    SGML_write(context, str, strlen(str));
 }
 
 /*_______________________________________________________________________
@@ -4507,11 +4555,12 @@ HTStream *SGML_new(const SGML_dtd * dtd,
  */
 int SGML_offset(void)
 {
+    int result = sgml_offset;
+
 #ifdef USE_PRETTYSRC
-    return sgml_offset + psrc_view;
-#else
-    return sgml_offset;
+    result += psrc_view;
 #endif
+    return result;
 }
 
 /*		Asian character conversion functions