diff options
author | Thomas E. Dickey <dickey@invisible-island.net> | 1998-03-04 19:00:00 -0500 |
---|---|---|
committer | Thomas E. Dickey <dickey@invisible-island.net> | 1998-03-04 19:00:00 -0500 |
commit | e9b52cbfe84bc9e13568e784836c9e0b4b1e0913 (patch) | |
tree | c3174e1d8d535e0e82e22dfad2bb803bef288cdd /src | |
parent | 349da2fb30fd6d2be4bd47a95fee9915b50f6d67 (diff) | |
download | lynx-snapshots-e9b52cbfe84bc9e13568e784836c9e0b4b1e0913.tar.gz |
snapshot of project "lynx", label v2-8pre_3
Diffstat (limited to 'src')
-rw-r--r-- | src/HTML.c | 2 | ||||
-rw-r--r-- | src/HTML.h | 3 | ||||
-rw-r--r-- | src/LYCharUtils.c | 52 | ||||
-rw-r--r-- | src/chrtrans/README.format | 9 | ||||
-rw-r--r-- | src/chrtrans/cp1251_uni.tbl | 2 | ||||
-rw-r--r-- | src/chrtrans/cp437_uni.tbl | 130 | ||||
-rw-r--r-- | src/chrtrans/cp850_uni.tbl | 66 | ||||
-rw-r--r-- | src/chrtrans/def7_uni.tbl | 94 |
8 files changed, 130 insertions, 228 deletions
diff --git a/src/HTML.c b/src/HTML.c index 8d5c705b..14a11eff 100644 --- a/src/HTML.c +++ b/src/HTML.c @@ -6334,7 +6334,7 @@ End_Object: * SGML unescape any character references in TEXTAREA * content, then parse it into individual lines * to be handled as a series of INPUT fields (ugh!). - * Any raw 8-bit or multibye characters already have been + * Any raw 8-bit or multibyte characters already have been * handled in relation to the display character set * in SGML_character(). */ diff --git a/src/HTML.h b/src/HTML.h index 5a642367..7cb4adfe 100644 --- a/src/HTML.h +++ b/src/HTML.h @@ -26,9 +26,6 @@ #define TRANSLATE_AND_UNESCAPE_ENTITIES(s, p, h) \ LYUCFullyTranslateString(s, ATTR_CS_IN, current_char_set, YES, p, h, st_HTML) -#define TRANSLATE_AND_UNESCAPE_ENTITIES4(s, cs_to, p, h) \ - LYUCFullyTranslateString(s, ATTR_CS_IN, cs_to, YES, p, h, st_HTML) /* not used */ - #define TRANSLATE_AND_UNESCAPE_ENTITIES5(s,cs_from,cs_to,p,h) \ LYUCFullyTranslateString(s, cs_from, cs_to, YES, p, h, st_HTML) diff --git a/src/LYCharUtils.c b/src/LYCharUtils.c index 33e775b0..7055814f 100644 --- a/src/LYCharUtils.c +++ b/src/LYCharUtils.c @@ -413,6 +413,11 @@ PUBLIC void LYFillLocalFileURL ARGS2( ** The META tag is not written if the display character set (passed as ** disp_chndl) already corresponds to the charset assumption that ** would be made when the file is read. - KW +** +** Currently this function used for temporary files like "Lynx Info Page" +** and for one permanent - bookmarks (so may be a problem if you change +** display charset later: new bookmark entries may be wrongly interpreted). +** - LP */ PUBLIC void LYAddMETAcharsetToFD ARGS2( FILE *, fd, @@ -1513,16 +1518,33 @@ PRIVATE char * UCPutUtf8ToBuffer ARGS3(char *, q, UCode_t, code, BOOL, terminate PRIVATE char *hex = "0123456789ABCDEF"; /* -** This function translates a string from charset -** `cs_from' to charset `cs_to', reallocating it if necessary. 
-** If `do_ent' is YES, it also converts HTML named entities -** and numeric character references (NCRs) to their `cs_to' -** replacements. + * Any raw 8-bit or multibyte characters already have been + * handled in relation to the display character set + * in SGML_character(), including named and numeric entities. + * +** This function is used for translating HTML special fields inside tags +** (ALT=, VALUE=, etc.) from charset `cs_from' to charset `cs_to'. +** It also unescapes non-ASCII characters from URL (#fragments !) +** if st_URL is active. +** +** If `do_ent' is YES, it converts named entities +** and numeric character references (NCRs) to their `cs_to' replacements. +** +** Named entities are converted to unicodes. NCRs (unicodes) are converted +** by UCdomap.c chartrans functions. +** ???NCRs with values in the ISO-8859-1 range 160-255 may be converted +** to their HTML entity names (via old-style entities) and then translated +** according to the LYCharSets.c array for `cs_out'???. +** +** Some characters (see descriptions in `put_special_unicodes' from SGML.c) +** are translated according to the state of the boolean variables +** `use_lynx_specials', `plain_space' and `hidden'. It is not clear yet: +** ** If plain_space is TRUE, nbsp (160) will be treated as an ASCII ** space (32). If hidden is TRUE, entities will be translated ** (if `do_ent' is YES) but escape sequences will be passed unaltered. ** If `hidden' is FALSE, some characters are converted to Lynx special -** codes (160, 173, .. @@ need list @@) (or ASCII space if `plain_space' +** codes (see `put_special_unicodes') or ASCII space if `plain_space' ** applies). @@ is `use_lynx_specials' needed, does it have any effect? 
@@ ** If `use_lynx_specials' is YES, translate byte values 160 and 173 ** meaning U+00A0 and U+00AD given as or converted from raw char input @@ -1536,15 +1558,6 @@ PRIVATE char *hex = "0123456789ABCDEF"; ** If `Back' is YES, an attempt is made to use UCReverseTransChar() for ** back translation which may be more efficient. (?) ** -** Named entities may be converted to their translations in the -** active LYCharSets.c array for `cs_out' or looked up as a Unicode -** value which is then passed to the chartrans functions (see UCdomap.c). -** @@ order? @@ -** NCRs with values in the ISO-8859-1 range 160-255 may be converted -** to their HTML entity names and then translated according to the -** LYCharSets.c array for `cs_out', in general NCRs are translated -** by UCdomap.c chartrans functions if necessary. -** ** If `stype' is st_URL, non-ASCII characters are URL-encoded instead. ** The sequence of bytes being URL-encoded is the raw input character if ** we couldn't translate it from `cs_in' (CJK etc.); otherwise it is the @@ -1560,8 +1573,11 @@ PRIVATE char *hex = "0123456789ABCDEF"; ** - dropped if `stype' is st_other, otherwise (i.e. st_HTML) ** - passed if `hidden' is TRUE or HTCJK is set, otherwise ** - dropped. -*/ -/* +** +** (If `stype' is st_URL or st_other, most of the parameters are in effect predefined: +** cs_from=cs_to, use_lynx_specials=plain_space=NO, and hidden=YES) +** +** ** Returns pointer to the char** passed in ** if string translated or translation unnecessary, ** NULL otherwise @@ -2204,7 +2220,7 @@ PRIVATE char ** LYUCFullyTranslateString_1 ARGS9( } /* ** Didn't find the entity. - ** Return to screen verbatim. + ** Return verbatim. 
*/ state = S_recover; break; diff --git a/src/chrtrans/README.format b/src/chrtrans/README.format index 7afc1c68..4ced0a14 100644 --- a/src/chrtrans/README.format +++ b/src/chrtrans/README.format @@ -120,4 +120,11 @@ Motivation: - The format is independent of details of other parts of the Lynx code, unlike the "old" LYCharsets.c mechanism. The tables don't have to - be changed in synch when e.g. new entities are added to the HTMLDTD. + be changed in synch when e.g. new entities are added to entities.h. + + +Note: the Default "7bit approximation" table can be used for +case-insensitive search for non-ascii letters if no upper/lower case +information is provided by other means, e.g. locale. It is assumed that +upper/lower case letters have their "7bit approximation" images +in def7_uni.tbl matched case-insensitively. diff --git a/src/chrtrans/cp1251_uni.tbl b/src/chrtrans/cp1251_uni.tbl index 21a44414..e9bb9460 100644 --- a/src/chrtrans/cp1251_uni.tbl +++ b/src/chrtrans/cp1251_uni.tbl @@ -21,7 +21,9 @@ OWinCyrillic (cp1251) # # The entries are in cp1251_WinCyrillic order # +# 0x20-0x7f idem +# 0x80 U+0402 #CYRILLIC CAPITAL LETTER DJE 0x81 U+0403 #CYRILLIC CAPITAL LETTER GJE 0x82 U+201A #SINGLE LOW-9 QUOTATION MARK diff --git a/src/chrtrans/cp437_uni.tbl b/src/chrtrans/cp437_uni.tbl index ad8d9940..621e730e 100644 --- a/src/chrtrans/cp437_uni.tbl +++ b/src/chrtrans/cp437_uni.tbl @@ -27,134 +27,8 @@ ODosLatinUS (cp437) # some mappings of greek letters to latin letters added, # just for fun.. 
- KW # -0x00 U+0000 #NULL -0x01 U+0001 #START OF HEADING -0x02 U+0002 #START OF TEXT -0x03 U+0003 #END OF TEXT -0x04 U+0004 #END OF TRANSMISSION -0x05 U+0005 #ENQUIRY -0x06 U+0006 #ACKNOWLEDGE -0x07 U+0007 #BELL -0x08 U+0008 #BACKSPACE -0x09 U+0009 #HORIZONTAL TABULATION -0x0a U+000a #LINE FEED -0x0b U+000b #VERTICAL TABULATION -0x0c U+000c #FORM FEED -0x0d U+000d #CARRIAGE RETURN -0x0e U+000e #SHIFT OUT -0x0f U+000f #SHIFT IN -0x10 U+0010 #DATA LINK ESCAPE -0x11 U+0011 #DEVICE CONTROL ONE -0x12 U+0012 #DEVICE CONTROL TWO -0x13 U+0013 #DEVICE CONTROL THREE -0x14 U+0014 U+03a0 #DEVICE CONTROL FOUR -0x15 U+0015 #NEGATIVE ACKNOWLEDGE -0x16 U+0016 #SYNCHRONOUS IDLE -0x17 U+0017 #END OF TRANSMISSION BLOCK -0x18 U+0018 #CANCEL -0x19 U+0019 #END OF MEDIUM -0x1a U+001a #SUBSTITUTE -0x1b U+001b #ESCAPE -0x1c U+001c #FILE SEPARATOR -0x1d U+001d #GROUP SEPARATOR -0x1e U+001e #RECORD SEPARATOR -0x1f U+001f #UNIT SEPARATOR -0x20 U+0020 #SPACE -0x21 U+0021 #EXCLAMATION MARK -0x22 U+0022 #QUOTATION MARK -0x23 U+0023 #NUMBER SIGN -0x24 U+0024 #DOLLAR SIGN -0x25 U+0025 #PERCENT SIGN -0x26 U+0026 #AMPERSAND -0x27 U+0027 #APOSTROPHE -0x28 U+0028 #LEFT PARENTHESIS -0x29 U+0029 #RIGHT PARENTHESIS -0x2a U+002a #ASTERISK -0x2b U+002b #PLUS SIGN -0x2c U+002c #COMMA -0x2d U+002d #HYPHEN-MINUS -0x2e U+002e #FULL STOP -0x2f U+002f #SOLIDUS -0x30 U+0030 #DIGIT ZERO -0x31 U+0031 #DIGIT ONE -0x32 U+0032 #DIGIT TWO -0x33 U+0033 #DIGIT THREE -0x34 U+0034 #DIGIT FOUR -0x35 U+0035 #DIGIT FIVE -0x36 U+0036 #DIGIT SIX -0x37 U+0037 #DIGIT SEVEN -0x38 U+0038 #DIGIT EIGHT -0x39 U+0039 #DIGIT NINE -0x3a U+003a #COLON -0x3b U+003b #SEMICOLON -0x3c U+003c #LESS-THAN SIGN -0x3d U+003d #EQUALS SIGN -0x3e U+003e #GREATER-THAN SIGN -0x3f U+003f #QUESTION MARK -0x40 U+0040 #COMMERCIAL AT -0x41 U+0041 U+0391 #LATIN CAPITAL LETTER A -0x42 U+0042 U+0392 #LATIN CAPITAL LETTER B -0x43 U+0043 #LATIN CAPITAL LETTER C -0x44 U+0044 #LATIN CAPITAL LETTER D -0x45 U+0045 U+0395 #LATIN CAPITAL LETTER E -0x46 U+0046 #LATIN 
CAPITAL LETTER F -0x47 U+0047 #LATIN CAPITAL LETTER G -0x48 U+0048 U+0397 #LATIN CAPITAL LETTER H -0x49 U+0049 U+0399 #LATIN CAPITAL LETTER I -0x4a U+004a #LATIN CAPITAL LETTER J -0x4b U+004b U+039a #LATIN CAPITAL LETTER K -0x4c U+004c #LATIN CAPITAL LETTER L -0x4d U+004d U+039c #LATIN CAPITAL LETTER M -0x4e U+004e U+039d #LATIN CAPITAL LETTER N -0x4f U+004f U+039f #LATIN CAPITAL LETTER O -0x50 U+0050 U+03a1 #LATIN CAPITAL LETTER P -0x51 U+0051 #LATIN CAPITAL LETTER Q -0x52 U+0052 #LATIN CAPITAL LETTER R -0x53 U+0053 #LATIN CAPITAL LETTER S -0x54 U+0054 U+03a4 #LATIN CAPITAL LETTER T -0x55 U+0055 #LATIN CAPITAL LETTER U -0x56 U+0056 #LATIN CAPITAL LETTER V -0x57 U+0057 #LATIN CAPITAL LETTER W -0x58 U+0058 U+03a7 #LATIN CAPITAL LETTER X -0x59 U+0059 U+03a5 #LATIN CAPITAL LETTER Y -0x5a U+005a U+0396 #LATIN CAPITAL LETTER Z -0x5b U+005b #LEFT SQUARE BRACKET -0x5c U+005c #REVERSE SOLIDUS -0x5d U+005d #RIGHT SQUARE BRACKET -0x5e U+005e #CIRCUMFLEX ACCENT -0x5f U+005f #LOW LINE -0x60 U+0060 #GRAVE ACCENT -0x61 U+0061 #LATIN SMALL LETTER A -0x62 U+0062 #LATIN SMALL LETTER B -0x63 U+0063 #LATIN SMALL LETTER C -0x64 U+0064 #LATIN SMALL LETTER D -0x65 U+0065 #LATIN SMALL LETTER E -0x66 U+0066 #LATIN SMALL LETTER F -0x67 U+0067 U+03b3 #LATIN SMALL LETTER G -0x68 U+0068 U+03b7 #LATIN SMALL LETTER H -0x69 U+0069 U+03b9 #LATIN SMALL LETTER I -0x6a U+006a #LATIN SMALL LETTER J -0x6b U+006b U+03ba #LATIN SMALL LETTER K -0x6c U+006c U+03bb #LATIN SMALL LETTER L -0x6d U+006d #LATIN SMALL LETTER M -0x6e U+006e #LATIN SMALL LETTER N -0x6f U+006f U+03bf #LATIN SMALL LETTER O -0x70 U+0070 U+03c1 #LATIN SMALL LETTER P -0x71 U+0071 #LATIN SMALL LETTER Q -0x72 U+0072 #LATIN SMALL LETTER R -0x73 U+0073 U+03c2 #LATIN SMALL LETTER S -0x74 U+0074 #LATIN SMALL LETTER T -0x75 U+0075 U+03c5 #LATIN SMALL LETTER U -0x76 U+0076 U+03bd #LATIN SMALL LETTER V -0x77 U+0077 U+03c9 #LATIN SMALL LETTER W -0x78 U+0078 U+03c7 #LATIN SMALL LETTER X -0x79 U+0079 #LATIN SMALL LETTER Y -0x7a U+007a U+03b6 
#LATIN SMALL LETTER Z -0x7b U+007b #LEFT CURLY BRACKET -0x7c U+007c #VERTICAL LINE -0x7d U+007d #RIGHT CURLY BRACKET -0x7e U+007e #TILDE -0x7f U+007f #DELETE +0x20-0x7f idem +# 0x80 U+00c7 #LATIN CAPITAL LETTER C WITH CEDILLA 0x81 U+00fc U+03cb #LATIN SMALL LETTER U WITH DIAERESIS 0x82 U+00e9 #LATIN SMALL LETTER E WITH ACUTE diff --git a/src/chrtrans/cp850_uni.tbl b/src/chrtrans/cp850_uni.tbl index 8a191fb7..05685971 100644 --- a/src/chrtrans/cp850_uni.tbl +++ b/src/chrtrans/cp850_uni.tbl @@ -4,7 +4,7 @@ #but there has to be exactly one table marked as "default". D0 # -#The MIME name of this charset. +#The MIME name of this charset. Mcp850 #Name as a Display Charset (used on Options screen) @@ -27,38 +27,38 @@ ODosLatin1 (cp850) # # The entries are in cp850_DOSLatin1 order # -0x00 U+0000 #NULL -0x01 U+0001 #START OF HEADING -0x02 U+0002 #START OF TEXT -0x03 U+0003 #END OF TEXT -0x04 U+0004 #END OF TRANSMISSION -0x05 U+0005 #ENQUIRY -0x06 U+0006 #ACKNOWLEDGE -0x07 U+0007 #BELL -0x08 U+0008 #BACKSPACE -0x09 U+0009 #HORIZONTAL TABULATION -0x0a U+000a #LINE FEED -0x0b U+000b #VERTICAL TABULATION -0x0c U+000c #FORM FEED -0x0d U+000d #CARRIAGE RETURN -0x0e U+000e #SHIFT OUT -0x0f U+000f #SHIFT IN -0x10 U+0010 #DATA LINK ESCAPE -0x11 U+0011 #DEVICE CONTROL ONE -0x12 U+0012 #DEVICE CONTROL TWO -0x13 U+0013 #DEVICE CONTROL THREE -0x14 U+0014 #DEVICE CONTROL FOUR -0x15 U+0015 #NEGATIVE ACKNOWLEDGE -0x16 U+0016 #SYNCHRONOUS IDLE -0x17 U+0017 #END OF TRANSMISSION BLOCK -0x18 U+0018 #CANCEL -0x19 U+0019 #END OF MEDIUM -0x1a U+001a #SUBSTITUTE -0x1b U+001b #ESCAPE -0x1c U+001c #FILE SEPARATOR -0x1d U+001d #GROUP SEPARATOR -0x1e U+001e #RECORD SEPARATOR -0x1f U+001f #UNIT SEPARATOR +#0x00 U+0000 #NULL +#0x01 U+0001 #START OF HEADING +#0x02 U+0002 #START OF TEXT +#0x03 U+0003 #END OF TEXT +#0x04 U+0004 #END OF TRANSMISSION +#0x05 U+0005 #ENQUIRY +#0x06 U+0006 #ACKNOWLEDGE +#0x07 U+0007 #BELL +#0x08 U+0008 #BACKSPACE +#0x09 U+0009 #HORIZONTAL TABULATION +#0x0a U+000a #LINE FEED 
+#0x0b U+000b #VERTICAL TABULATION +#0x0c U+000c #FORM FEED +#0x0d U+000d #CARRIAGE RETURN +#0x0e U+000e #SHIFT OUT +#0x0f U+000f #SHIFT IN +#0x10 U+0010 #DATA LINK ESCAPE +#0x11 U+0011 #DEVICE CONTROL ONE +#0x12 U+0012 #DEVICE CONTROL TWO +#0x13 U+0013 #DEVICE CONTROL THREE +#0x14 U+0014 #DEVICE CONTROL FOUR +#0x15 U+0015 #NEGATIVE ACKNOWLEDGE +#0x16 U+0016 #SYNCHRONOUS IDLE +#0x17 U+0017 #END OF TRANSMISSION BLOCK +#0x18 U+0018 #CANCEL +#0x19 U+0019 #END OF MEDIUM +#0x1a U+001a #SUBSTITUTE +#0x1b U+001b #ESCAPE +#0x1c U+001c #FILE SEPARATOR +#0x1d U+001d #GROUP SEPARATOR +#0x1e U+001e #RECORD SEPARATOR +#0x1f U+001f #UNIT SEPARATOR 0x20 U+0020 #SPACE 0x21 U+0021 #EXCLAMATION MARK 0x22 U+0022 #QUOTATION MARK diff --git a/src/chrtrans/def7_uni.tbl b/src/chrtrans/def7_uni.tbl index 0c86d234..66a63f76 100644 --- a/src/chrtrans/def7_uni.tbl +++ b/src/chrtrans/def7_uni.tbl @@ -92,6 +92,7 @@ U+00fc:u: 0x79 U+00fd U+00fe:th 0x79 U+00ff +# end of latin-1 repertoire 0x41 U+0100 U+0102 U+0104 # A 0x61 U+0101 U+0103 U+0105 # a 0x43 U+0106 U+0108 U+010a U+010c # C @@ -243,7 +244,8 @@ U+0217:u) # Linkname: FAQ: Representing IPA Phonetics in ASCII # URL: http://www.hpl.hp.com/personal/Evan_Kirshenbaum/IPA/faq.html # (corrected in Russian Cyrillic area). -# +# (corrected in Greek area). +# 0x41 U+0251 # LATIN SMALL LETTER SCRIPT A -> A U+0252:A. 
U+0253:b` @@ -344,6 +346,7 @@ U+037a:j3 U+037e:?% U+0384:'* U+0385:'% +# Greek letters U+0386:A% U+0387:.* U+0388:E% @@ -355,69 +358,70 @@ U+038f:W% U+0390:i3 U+0391:A U+0392:B -U+0393:G* -U+0394:D* +U+0393:G +U+0394:D U+0395:E U+0396:Z -U+0397:Y* -U+0398:H* +U+0397:Y +U+0398:TH U+0399:I U+039a:K -U+039b:L* +U+039b:L U+039c:M U+039d:N -U+039e:C* +U+039e:C U+039f:O -U+03a0:P* -U+03a1:R* -U+03a3:S* +U+03a0:P +U+03a1:R +U+03a3:S U+03a4:T -U+03a5:U* -U+03a6:F* -U+03a7:X* -U+03a8:Q* -U+03a9:W* -U+03aa:J* +U+03a5:U +U+03a6:F +U+03a7:X +U+03a8:Q +U+03a9:W +U+03aa:J U+03ab:V* U+03ac:a% U+03ad:e% U+03ae:y% U+03af:i% U+03b0:u3 -U+03b1:a* -U+03b2:b* -U+03b3:g* -U+03b4:d* -U+03b5:e* -U+03b6:z* -U+03b7:y* -U+03b8:h* -U+03b9:i* -U+03ba:k* -U+03bb:l* -U+03bc:m* -U+03bd:n* -U+03be:c* +U+03b1:a +U+03b2:b +U+03b3:g +U+03b4:d +U+03b5:e +U+03b6:z +U+03b7:y +U+03b8:th +U+03b9:i +U+03ba:k +U+03bb:l +U+03bc:m +U+03bd:n +U+03be:c U+03bf:o -U+03c0:p* -U+03c1:r* +U+03c0:p +U+03c1:r U+03c2:*s -U+03c3:s* -U+03c4:t* -U+03c5:u* -U+03c6:f* -U+03c7:x* -U+03c8:q* -U+03c9:w* -U+03ca:j* +U+03c3:s +U+03c4:t +U+03c5:u +U+03c6:f +U+03c7:x +U+03c8:q +U+03c9:w +U+03ca:j U+03cb:v* U+03cc:o% U+03cd:u% U+03ce:w% -U+03d0:b3 +# Greek symbols +U+03d0:beta U+03d1:theta -U+03d2:upsi -U+03d5:phi +U+03d2:upsi +U+03d5:phi U+03d6:pi U+03da:T3 U+03db:t3 @@ -427,7 +431,7 @@ U+03de:K3 U+03df:k3 U+03e0:P3 U+03e1:p3 -U+03f0:kappa +U+03f0:kappa U+03f1:rho U+03f4:'% U+03f5:j3 @@ -1276,6 +1280,7 @@ U+1fdf:?; U+1fed:!: U+1fef:!* U+1ffe:;; +# General punctuation: 0x20 U+2000 U+2002 U+2004-U+2009 # spaces U+2001: U+2003: @@ -1315,6 +1320,7 @@ U+203c:!! U+203e:'- 0x2d U+2043 # HYPHEN BULLET ? U+2044:/ +# end of General punctuation. U+2070:^0 U+2074:^4 U+2075:^5 |