diff options
author | Thomas E. Dickey <dickey@invisible-island.net> | 1996-09-02 19:39:24 -0400 |
---|---|---|
committer | Thomas E. Dickey <dickey@invisible-island.net> | 1996-09-02 19:39:24 -0400 |
commit | e087f6d44e87f489fcb3056e86319ebba4218156 (patch) | |
tree | d045b58011bfbbf5186d34c4fed9e0dedb363275 /WWW/Library/Implementation/SGML.c | |
download | lynx-snapshots-e087f6d44e87f489fcb3056e86319ebba4218156.tar.gz |
snapshot of project "lynx", label v2_6
Diffstat (limited to 'WWW/Library/Implementation/SGML.c')
-rw-r--r-- | WWW/Library/Implementation/SGML.c | 2145 |
1 files changed, 2145 insertions, 0 deletions
diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c new file mode 100644 index 00000000..9cddd35a --- /dev/null +++ b/WWW/Library/Implementation/SGML.c @@ -0,0 +1,2145 @@ +/* General SGML Parser code SGML.c +** ======================== +** +** This module implements an HTStream object. To parse an +** SGML file, create this object which is a parser. The object +** is (currently) created by being passed a DTD structure, +** and a target HTStructured object at which to throw the parsed stuff. +** +** 6 Feb 93 Binary searches used. Interface modified. +*/ +#include "HTUtils.h" +#include "tcp.h" /* For FROMASCII */ + +#include "SGML.h" +#include "HTMLDTD.h" +#include "HTCJK.h" + +#include <ctype.h> +/*#include <stdio.h> included in HTUtils.h -- FM */ +#include "HTChunk.h" + +#include "LYLeaks.h" + +#define INVALID (-1) + +#define FREE(x) if (x) {free(x); x = NULL;} + +PUBLIC HTCJKlang HTCJK = NOCJK; /* CJK enum value. */ +PUBLIC BOOL HTPassEightBitRaw = FALSE; /* Pass 161-172,174-255 raw. */ +PUBLIC BOOL HTPassEightBitNum = FALSE; /* Pass ^ numeric entities raw. */ +PUBLIC BOOL HTPassHighCtrlRaw = FALSE; /* Pass 127-160,173, raw. */ +PUBLIC BOOL HTPassHighCtrlNum = FALSE; /* Pass &#128;-&#159; raw. */ + +extern BOOLEAN LYCheckForCSI PARAMS((HTStructured *target, char **url)); +extern void LYDoCSI PARAMS((char *url, CONST char *comment, char **csi)); + +/* The State (context) of the parser +** +** This is passed with each call to make the parser reentrant +** +*/ + +#define MAX_ATTRIBUTES 36 /* Max number of attributes per element */ + + +/* Element Stack +** ------------- +** This allows us to return down the stack reselecting styles. +** As we return, attribute values will be garbage in general. 
+*/ +typedef struct _HTElement HTElement; +struct _HTElement { + HTElement * next; /* Previously nested element or 0 */ + HTTag* tag; /* The tag at this level */ +}; + + +/* Internal Context Data Structure +** ------------------------------- +*/ +struct _HTStream { + + CONST HTStreamClass * isa; /* inherited from HTStream */ + + CONST SGML_dtd *dtd; + HTStructuredClass *actions; /* target class */ + HTStructured *target; /* target object */ + + HTTag *current_tag; + int current_attribute_number; + HTChunk *string; + HTElement *element_stack; + enum sgml_state { S_text, S_litteral, + S_tag, S_tag_gap, S_attr, S_attr_gap, S_equals, S_value, + S_ero, S_cro, + S_exclamation, S_comment, S_doctype, S_marked, + S_sgmlent, S_sgmlele, S_sgmlatt, + S_squoted, S_dquoted, S_end, S_entity, + S_esc, S_dollar, S_paren, S_nonascii_text, + S_dollar_paren, + S_esc_sq, S_dollar_sq, S_paren_sq, S_nonascii_text_sq, + S_dollar_paren_sq, + S_esc_dq, S_dollar_dq, S_paren_dq, S_nonascii_text_dq, + S_dollar_paren_dq, + S_in_kanji, S_junk_tag} state; +#ifdef CALLERDATA + void * callerData; +#endif /* CALLERDATA */ + BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */ + char * value[MAX_ATTRIBUTES]; /* malloc'd strings or NULL if none */ + + BOOL lead_exclamation; + BOOL first_dash; + BOOL end_comment; + BOOL doctype_bracket; + BOOL first_bracket; + BOOL second_bracket; + + char * recover; + int recover_index; + char * include; + int include_index; + char * url; + char * csi; + int csi_index; +} ; + + +#define PUTC(ch) ((*context->actions->put_character)(context->target, ch)) + +extern BOOL historical_comments; +extern BOOL minimal_comments; +extern BOOL soft_dquotes; + +/* Handle Attribute +** ---------------- +*/ +/* PUBLIC CONST char * SGML_default = ""; ?? 
*/ + +PRIVATE void handle_attribute_name ARGS2( + HTStream *, context, + CONST char *, s) +{ + + HTTag * tag = context->current_tag; + attr * attributes = tag->attributes; + + int high, low, i, diff; /* Binary search for attribute name */ + for (low = 0, high = tag->number_of_attributes; + high > low; + diff < 0 ? (low = i+1) : (high = i)) { + i = (low + (high-low)/2); + diff = strcasecomp(attributes[i].name, s); + if (diff == 0) { /* success: found it */ + context->current_attribute_number = i; + context->present[i] = YES; + FREE(context->value[i]); + return; + } /* if */ + + } /* for */ + + if (TRACE) + fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n", + s, context->current_tag->name); + context->current_attribute_number = INVALID; /* Invalid */ +} + + +/* Handle attribute value +** ---------------------- +*/ +PRIVATE void handle_attribute_value ARGS2( + HTStream *, context, + CONST char *, s) +{ + if (context->current_attribute_number != INVALID) { + StrAllocCopy(context->value[context->current_attribute_number], s); + } else { + if (TRACE) + fprintf(stderr, "SGML: Attribute value %s ignored\n", s); + } + context->current_attribute_number = INVALID; /* can't have two assignments! */ +} + + +/* Handle entity +** ------------- +** +** On entry, +** s contains the entity name zero terminated +** Bugs: +** If the entity name is unknown, the terminator is treated as +** a printable non-special character in all cases, even if it is '<' +** Bug-fix: +** Modified SGML_character() so we only come here with terminator +** as '\0' and check a FoundEntity flag. 
-- Foteos Macrides +*/ + +PRIVATE BOOL FoundEntity = FALSE; + +PRIVATE void handle_entity ARGS2( + HTStream *, context, + char, term) +{ + CONST char ** entities = context->dtd->entity_names; + CONST char *s = context->string->data; + int high, low, i, diff; + + /* + ** Use Lynx special characters directly for nbsp, ensp, emsp, + ** thinsp, and shy so we go through the HTML_put_character() + ** filters instead of using HTML_put_string(). - FM + */ + if (!strcmp(s, "nbsp")) { + PUTC(1); + FoundEntity = TRUE; + return; + } + if (!strcmp(s, "ensp") || !strcmp(s, "emsp") || !strcmp(s, "thinsp")) { + PUTC(2); + FoundEntity = TRUE; + return; + } + if (!strcmp(s, "shy")) { + PUTC(7); + FoundEntity = TRUE; + return; + } + + /* + ** Handle all other entities normally. - FM + */ + FoundEntity = FALSE; + for (low = 0, high = context->dtd->number_of_entities; + high > low; + diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */ + i = (low + (high-low)/2); + diff = strcmp(entities[i], s); /* Case sensitive! */ + if (diff == 0) { /* success: found it */ + (*context->actions->put_entity)(context->target, i); + FoundEntity = TRUE; + return; + } + } + /* + ** If entity string not found, display as text. 
+ */ + if (TRACE) + fprintf(stderr, "SGML: Unknown entity %s\n", s); + PUTC('&'); + { + CONST char *p; + for (p = s; *p; p++) { + PUTC(*p); + } + } + if (term != '\0') + PUTC(term); +} + + +/* Handle comment +** -------------- +*/ +PRIVATE void handle_comment ARGS1( + HTStream *, context) +{ + CONST char *s = context->string->data; + + if (TRACE) + fprintf(stderr, "SGML Comment:\n<%s>\n", s); + + if (context->csi == NULL && + strncmp(s, "!--#", 4) == 0 && + LYCheckForCSI(context->target, (char **)&context->url) == TRUE) { + LYDoCSI(context->url, s, (char **)&context->csi); + } + + return; +} + + +/* Handle identifier +** ----------------- +*/ +PRIVATE void handle_identifier ARGS1( + HTStream *, context) +{ + CONST char *s = context->string->data; + + if (TRACE) + fprintf(stderr, "SGML Identifier\n<%s>\n", s); + + return; +} + + +/* Handle doctype +** -------------- +*/ +PRIVATE void handle_doctype ARGS1( + HTStream *, context) +{ + CONST char *s = context->string->data; + + if (TRACE) + fprintf(stderr, "SGML Doctype\n<%s>\n", s); + + return; +} + + +/* Handle marked +** ------------- +*/ +PRIVATE void handle_marked ARGS1( + HTStream *, context) +{ + CONST char *s = context->string->data; + + if (TRACE) + fprintf(stderr, "SGML Marked Section:\n<%s>\n", s); + + return; +} + + +/* Handle sgmlent +** -------------- +*/ +PRIVATE void handle_sgmlent ARGS1( + HTStream *, context) +{ + CONST char *s = context->string->data; + + if (TRACE) + fprintf(stderr, "SGML Entity Declaration:\n<%s>\n", s); + + return; +} + + +/* Handle sgmlent +** -------------- +*/ +PRIVATE void handle_sgmlele ARGS1( + HTStream *, context) +{ + CONST char *s = context->string->data; + + if (TRACE) + fprintf(stderr, "SGML Element Declaration:\n<%s>\n", s); + + return; +} + + +/* Handle sgmlatt +** -------------- +*/ +PRIVATE void handle_sgmlatt ARGS1( + HTStream *, context) +{ + CONST char *s = context->string->data; + + if (TRACE) + fprintf(stderr, "SGML Attribute Declaration:\n<%s>\n", s); + + 
return; +} + + +/* End element +** ----------- +*/ +PRIVATE void end_element ARGS2( + HTStream *, context, + HTTag *, old_tag) +{ + if (TRACE) + fprintf(stderr, "SGML: End </%s>\n", old_tag->name); + if (old_tag->contents == SGML_EMPTY) { + if (TRACE) + fprintf(stderr, "SGML: Illegal end tag </%s> found.\n", + old_tag->name); + return; + } +#ifdef WIND_DOWN_STACK + while (context->element_stack) { /* Loop is error path only */ +#else + if (context->element_stack) { /* Substitute and remove one stack element */ +#endif /* WIND_DOWN_STACK */ + HTElement * N = context->element_stack; + HTTag * t = N->tag; + + if (old_tag != t) { /* Mismatch: syntax error */ + if (context->element_stack->next) { /* This is not the last level */ + if (TRACE) fprintf(stderr, + "SGML: Found </%s> when expecting </%s>. </%s> assumed.\n", + old_tag->name, t->name, t->name); + } else { /* last level */ + if (TRACE) fprintf(stderr, + "SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n", + old_tag->name, t->name, old_tag->name); + return; /* Ignore */ + } + } + + context->element_stack = N->next; /* Remove from stack */ + FREE(N); + (*context->actions->end_element)(context->target, + t - context->dtd->tags, (char **)&context->include); +#ifdef WIND_DOWN_STACK + if (old_tag == t) + return; /* Correct sequence */ +#else + return; +#endif /* WIND_DOWN_STACK */ + + /* Syntax error path only */ + + } + if (TRACE) + fprintf(stderr, "SGML: Extra end tag </%s> found and ignored.\n", + old_tag->name); +} + + +/* Start a element +*/ +PRIVATE void start_element ARGS1( + HTStream *, context) +{ + HTTag * new_tag = context->current_tag; + + if (TRACE) + fprintf(stderr, "SGML: Start <%s>\n", new_tag->name); + (*context->actions->start_element)( + context->target, + new_tag - context->dtd->tags, + context->present, + (CONST char**) context->value, /* coerce type for think c */ + (char **)&context->include); + if (new_tag->contents != SGML_EMPTY) { /* i.e. 
tag not empty */ + HTElement * N = (HTElement *)malloc(sizeof(HTElement)); + if (N == NULL) + outofmem(__FILE__, "start_element"); + N->next = context->element_stack; + N->tag = new_tag; + context->element_stack = N; + } +} + + +/* Find Tag in DTD tag list +** ------------------------ +** +** On entry, +** dtd points to dtd structire including valid tag list +** string points to name of tag in question +** +** On exit, +** returns: +** NULL tag not found +** else address of tag structure in dtd +*/ +PUBLIC HTTag * SGMLFindTag ARGS2( + CONST SGML_dtd*, dtd, + CONST char *, string) +{ + int high, low, i, diff; + for (low = 0, high=dtd->number_of_tags; + high > low; + diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */ + i = (low + (high-low)/2); + diff = strcasecomp(dtd->tags[i].name, string); /* Case insensitive */ + if (diff == 0) { /* success: found it */ + return &dtd->tags[i]; + } + } + return NULL; +} + +/*________________________________________________________________________ +** Public Methods +*/ + + +/* Could check that we are back to bottom of stack! 
@@ */ + +PUBLIC void SGML_free ARGS1( + HTStream *, context) +{ + int i; + HTElement * cur; + HTElement * next; + + (*context->actions->_free)(context->target); + HTChunkFree(context->string); + + /* free strings */ + for (i = 0; i < MAX_ATTRIBUTES; i++) + FREE(context->value[i]); + + FREE(context->recover); + FREE(context->include); + FREE(context->url); + FREE(context->csi); + + cur = context->element_stack; + while (cur) { + next = cur->next; + FREE(cur); + cur = next; + } + + FREE(context); +} + +PUBLIC void SGML_abort ARGS2( + HTStream *, context, + HTError, e) +{ + int i; + + (*context->actions->_abort)(context->target, e); + HTChunkFree(context->string); + + /* free strings */ + for (i = 0; i < MAX_ATTRIBUTES; i++) + FREE(context->value[i]); + + FREE(context->recover); + FREE(context->include); + FREE(context->url); + FREE(context->csi); + + FREE(context); +} + + +/* Read and write user callback handle +** ----------------------------------- +** +** The callbacks from the SGML parser have an SGML context parameter. +** These calls allow the caller to associate his own context with a +** particular SGML context. +*/ + +#ifdef CALLERDATA +PUBLIC void* SGML_callerData ARGS1( + HTStream *, context) +{ + return context->callerData; +} + +PUBLIC void SGML_setCallerData ARGS2( + HTStream *, context, + void*, data) +{ + context->callerData = data; +} +#endif /* CALLERDATA */ + +PUBLIC void SGML_character ARGS2( + HTStream *, context, + char, c) +{ + CONST SGML_dtd *dtd = context->dtd; + HTChunk *string = context->string; + CONST char * EntityName; + extern int current_char_set; + extern char *LYchar_set_names[]; + extern CONST char * HTMLGetEntityName PARAMS((int i)); + +top: + /* + ** Ignore low ISO 646 7-bit control characters + ** if HTCJK is not set. - FM + */ + if ((unsigned char)c < 32 && + c != 9 && c != 10 && c != 13 && + HTCJK == NOCJK) + return; + + /* + ** Ignore 127 if we don't have HTPassHighCtrlRaw + ** or HTCJK set. 
- FM + */ + if (c == 127 && + !(HTPassHighCtrlRaw || HTCJK != NOCJK)) + return; + + /* + ** Ignore 8-bit control characters 128 - 159 if + ** neither HTPassHighCtrlRaw nor HTCJK is set. - FM + */ + if ((unsigned char)c > 127 && (unsigned char)c < 160 && + !(HTPassHighCtrlRaw || HTCJK != NOCJK)) + return; + + /* + ** Handle character based on context->state. + */ + switch(context->state) { + + case S_in_kanji: + context->state = S_text; + PUTC(c); + break; + + case S_text: + if (HTCJK != NOCJK && (c & 0200) != 0) { + /* + ** Setting up for Kanji multibyte handling (based on + ** Takuya ASADA's (asada@three-a.co.jp) CJK Lynx). - FM + */ + context->state = S_in_kanji; + PUTC(c); + break; + } else if (HTCJK != NOCJK && c == '\033') { + /* + ** Setting up for CJK escape sequence handling (based on + ** Takuya ASADA's (asada@three-a.co.jp) CJK Lynx). - FM + */ + context->state = S_esc; + PUTC(c); + break; + } + if (c == '&' && (!context->element_stack || + (context->element_stack->tag && + (context->element_stack->tag->contents == + SGML_MIXED || + context->element_stack->tag->contents == + SGML_RCDATA)))) { + /* + ** Setting up for possible entity, without the leading '&'. - FM + */ + string->size = 0; + context->state = S_ero; + } else if (c == '<') { + /* + ** Setting up for possible tag. - FM + */ + string->size = 0; + context->state = (context->element_stack && + context->element_stack->tag && + context->element_stack->tag->contents == SGML_LITTERAL) + ? + S_litteral : S_tag; + /* + ** Convert 160 (nbsp) to Lynx special character if + ** neither HTPassHighCtrlRaw nor HTCJK is set. - FM + */ + } else if ((unsigned char)c == 160 && + !(HTPassHighCtrlRaw || HTCJK != NOCJK)) { + PUTC(1); + /* + ** Convert 173 (shy) to Lynx special character if + ** neither HTPassHighCtrlRaw nor HTCJK is set. 
- FM + */ + } else if ((unsigned char)c == 173 && + !(HTPassHighCtrlRaw || HTCJK != NOCJK)) { + PUTC(7); + /* + ** If it's any other (> 160) 8-bit chararcter, and + ** we have not set HTPassEightBitRaw nor HTCJK, nor + ** have the "ISO Latin 1" character set selected, + ** back translate for our character set. - FM + */ + } else if ((unsigned char)c > 160 && + !(HTPassEightBitRaw || HTCJK != NOCJK) && + strncmp(LYchar_set_names[current_char_set], + "ISO Latin 1", 11)) { + int i; + int value; + + string->size = 0; + value = (int)((unsigned char)c - 160); + EntityName = HTMLGetEntityName(value); + for (i = 0; EntityName[i]; i++) + HTChunkPutc(string, EntityName[i]); + HTChunkTerminate(string); + handle_entity(context, '\0'); + string->size = 0; + if (!FoundEntity) + PUTC(';'); + /* + ** If we get to here, pass the character. - FM + */ + } else { + PUTC(c); + } + break; + + /* + ** In litteral mode, waits only for specific end tag (for + ** compatibility with old servers, and for Lynx). - FM + */ + case S_litteral : + HTChunkPutc(string, c); + if (TOUPPER(c) != ((string->size == 1) ? + '/' : + context->element_stack->tag->name[string->size-2])) { + int i; + + /* + ** If complete match, end litteral. + */ + if ((c == '>') && + (!context->element_stack->tag->name[string->size-2])) { + end_element(context, context->element_stack->tag); + string->size = 0; + context->current_attribute_number = INVALID; + context->state = S_text; + break; + } + /* + ** If Mismatch: recover string. + */ + PUTC('<'); + for (i = 0; i < string->size; i++) /* recover */ + PUTC(string->data[i]); + string->size = 0; + context->state = S_text; + } + break; + + /* + ** Character reference (numeric entity) or named entity. + */ + case S_ero: + if (c == '#') { + /* + ** Setting up for possible numeric entity. + */ + context->state = S_cro; /* &# is Char Ref Open */ + break; + } + context->state = S_entity; /* Fall through! */ + + /* + ** Handle possible named entity. 
+ */ + case S_entity: + if ((unsigned char)c < 127 && isalnum((unsigned char)c)) { + /* + ** Accept valid ASCII character. - FM + */ + HTChunkPutc(string, c); + } else if (string->size == 0) { + /* + ** It was an ampersand that's just text, so output + ** the ampersand and recycle this character. - FM + */ + PUTC('&'); + context->state = S_text; + goto top; + } else { + /* + ** Terminate entity name and try to handle it. - FM + */ + HTChunkTerminate(string); + handle_entity(context, '\0'); + string->size = 0; + context->state = S_text; + /* + ** Don't eat the terminator if we didn't find the + ** entity name and therefore sent the raw string + ** via handle_entity(), or if the terminator is + ** not the "standard" semi-colon for HTML. - FM + */ + if (!FoundEntity || c != ';') + goto top; + } + break; + + /* + ** Handle possible numeric entity. + */ + case S_cro: + if ((unsigned char)c < 127 && isdigit((unsigned char)c)) { + /* + ** Accept only valid ASCII digits. - FM + */ + HTChunkPutc(string, c); /* accumulate a character NUMBER */ + } else if (string->size == 0) { + /* + ** No digits following the "&#" so recover + ** them and recycle the character. - FM + */ + PUTC('&'); + PUTC('#'); + context->state = S_text; + goto top; + } else if ((unsigned char)c > 127 || isalnum((unsigned char)c)) { + /* + ** We have digit(s), but not a valid terminator, + ** so recover the "&#" and digit(s) and recycle + ** the character. - FM + */ + int i; + PUTC('&'); + PUTC('#'); + for (i = 0; i < string->size; i++) /* recover */ + PUTC(string->data[i]); + string->size = 0; + context->state = S_text; + goto top; + } else { + /* + ** Terminate the numeric entity and try to handle it. - FM + */ + int value, i; + HTChunkTerminate(string); + if (sscanf(string->data, "%d", &value) == 1) { + if (value == 8482) { + /* + ** trade Treat as reg. - FM + */ + value = 174; + } + /* + ** Show the numeric entity if the value: + ** (1) Is greater than 255 (until we support Unicode). 
+ ** (2) Is less than 32, and not valid or we don't + ** have HTCJK set. + ** (3) Is 127 and we don't have HTPassHighCtrlRaw or + ** HTCJK set. + ** (4) Is 128 - 159 and we don't have HTPassHighCtrlNum + ** set. + ** - FM + */ + if ((value > 255) || + (value < 32 && + value != 9 && value != 10 && value != 13 && + HTCJK == NOCJK) || + (value == 127 && + !(HTPassHighCtrlRaw || HTCJK != NOCJK)) || + (value > 127 && value < 160 && + !HTPassHighCtrlNum)) { + if (value == 8194 || value == 8195 || value == 8201) { + /* + ** ensp, emsp or thinsp. - FM + */ + PUTC(2); + break; + } + if (value == 8211 || value == 8212) { + /* + ** ndash or mdash. - FM + */ + PUTC('-'); + break; + } + /* + ** Unhandled or llegal value. Recover the "&#" + ** and digit(s), and recycle the terminator. - FM + */ + PUTC('&'); + PUTC('#'); + string->size--; + for (i = 0; i < string->size; i++) /* recover */ + PUTC(string->data[i]); + string->size = 0; + context->state = S_text; + goto top; + } else if (value == 160) { + /* + ** Use Lynx special character for 160 (nbsp). - FM + */ + PUTC(1); + } else if (value == 173) { + /* + ** Use Lynx special character for 173 (shy) - FM + */ + PUTC(7); + } else if (value < 161 || HTPassEightBitNum || + !strncmp(LYchar_set_names[current_char_set], + "ISO Latin 1", 11)) { + /* + ** No conversion needed. - FM + */ + PUTC(FROMASCII((char)value)); + } else { + /* + ** Convert and handle as named entity. - FM + */ + value -= 160; + EntityName = HTMLGetEntityName(value); + if (EntityName && EntityName[0] != '\0') { + string->size = 0; + for (i = 0; EntityName[i]; i++) + HTChunkPutc(string, EntityName[i]); + HTChunkTerminate(string); + handle_entity(context, '\0'); + /* + ** Add a semi-colon if something went wrong + ** and handle_entity() sent the string. - FM + */ + if (!FoundEntity) { + PUTC(';'); + } + } else { + /* + ** Our conversion failed, so recover the "&#" + ** and digit(s), and recycle the terminator. 
- FM + */ + PUTC('&'); + PUTC('#'); + string->size--; + for (i = 0; i < string->size; i++) /* recover */ + PUTC(string->data[i]); + string->size = 0; + context->state = S_text; + goto top; + } + } + /* + ** If we get to here, we succeeded. Hoorah!!! - FM + */ + string->size = 0; + context->state = S_text; + /* + ** Don't eat the terminator if it's not + ** the "standard" semi-colon for HTML. - FM + */ + if (c != ';') + goto top; + } else { + /* + ** Not an entity, and don't know why not, so add the + ** terminator to the string, output the "&#", and + ** process the string via the recover element. - FM + */ + string->size--; + HTChunkPutc(string, c); + HTChunkTerminate(string); + PUTC('&'); + PUTC('#'); + if (context->recover == NULL) { + StrAllocCopy(context->recover, string->data); + context->recover_index = 0; + } else { + StrAllocCat(context->recover, string->data); + } + string->size = 0; + context->state = S_text; + break; + } + } + break; + + /* + ** Tag + */ + case S_tag: /* new tag */ + if ((unsigned char)c < 127 && isalnum((unsigned char)c)) { + /* + ** Add valid ASCII character. - FM + */ + HTChunkPutc(string, c); + } else if (c == '!' && !string->size) { /* <! */ + /* + ** Terminate and set up for possible comment, + ** identifier, declaration, or marked section. - FM + */ + context->state = S_exclamation; + context->lead_exclamation = TRUE; + context->doctype_bracket = FALSE; + context->first_bracket = FALSE; + HTChunkPutc(string, c); + break; + } else if (WHITE(c) && !string->size) { /* <WHITE */ + /* + ** Recover the '<' and WHITE character. - FM + */ + context->state = S_text; + PUTC('<'); + goto top; + } else { /* End of tag name */ + /* + ** Try to handle tag. 
- FM + */ + HTTag * t; + if (c == '/') { + if (TRACE) + if (string->size!=0) + fprintf(stderr,"SGML: `<%s/' found!\n", string->data); + context->state = S_end; + break; + } + HTChunkTerminate(string) ; + + t = SGMLFindTag(dtd, string->data); + if (!t) { + if (c == ':' && 0 == strcasecomp(string->data, "URL")) { + /* + ** Treat <URL: as text rather than a junk tag, + ** so we display it and the URL (Lynxism 8-). - FM + */ + int i; + PUTC('<'); + for (i = 0; i < 3; i++) /* recover */ + PUTC(string->data[i]); + PUTC(c); + if (TRACE) + fprintf(stderr, "SGML: Treating <%s%c as text\n", + string->data, c); + string->size = 0; + context->state = S_text; + } else { + if (TRACE) + fprintf(stderr, "SGML: *** Unknown element %s\n", + string->data); + context->state = (c == '>') ? S_text : S_junk_tag; + } + break; + } + context->current_tag = t; + + /* + ** Clear out attributes. + */ + { + int i; + for (i=0; i< context->current_tag->number_of_attributes; i++) + context->present[i] = NO; + } + string->size = 0; + context->current_attribute_number = INVALID; + + if (c == '>') { + if (context->current_tag->name) + start_element(context); + context->state = S_text; + } else { + context->state = S_tag_gap; + } + } + break; + + case S_exclamation: + if (context->lead_exclamation && c == '-') { + /* + ** Set up for possible comment. - FM + */ + context->lead_exclamation = FALSE; + context->first_dash = TRUE; + HTChunkPutc(string, c); + break; + } + if (context->lead_exclamation && c == '[') { + /* + ** Set up for possible marked section. - FM + */ + context->lead_exclamation = FALSE; + context->first_bracket = TRUE; + context->second_bracket = FALSE; + HTChunkPutc(string, c); + context->state = S_marked; + break; + } + if (context->first_dash && c == '-') { + /* + ** Set up to handle comment. 
- FM + */ + context->lead_exclamation = FALSE; + context->first_dash = FALSE; + context->end_comment = FALSE; + HTChunkPutc(string, c); + context->state = S_comment; + break; + } + context->lead_exclamation = FALSE; + context->first_dash = FALSE; + if (c == '>') { + /* + ** Try to handle identifier. - FM + */ + HTChunkTerminate(string); + handle_identifier(context); + string->size = 0; + context->state = S_text; + break; + } + if (WHITE(c)) { + if (string->size == 8 && + !strncasecomp(string->data, "!DOCTYPE", 8)) { + /* + ** Set up for DOCTYPE declaration. - FM + */ + HTChunkPutc(string, c); + context->doctype_bracket = FALSE; + context->state = S_doctype; + break; + } + if (string->size == 7 && + !strncasecomp(string->data, "!ENTITY", 7)) { + /* + ** Set up for ENTITY declaration. - FM + */ + HTChunkPutc(string, c); + context->first_dash = FALSE; + context->end_comment = TRUE; + context->state = S_sgmlent; + break; + } + if (string->size == 8 && + !strncasecomp(string->data, "!ELEMENT", 8)) { + /* + ** Set up for ELEMENT declaration. - FM + */ + HTChunkPutc(string, c); + context->first_dash = FALSE; + context->end_comment = TRUE; + context->state = S_sgmlele; + break; + } + if (string->size == 8 && + !strncasecomp(string->data, "!ATTLIST", 8)) { + /* + ** Set up for ATTLIST declaration. - FM + */ + HTChunkPutc(string, c); + context->first_dash = FALSE; + context->end_comment = TRUE; + context->state = S_sgmlatt; + break; + } + } + HTChunkPutc(string, c); + break; + + case S_comment: /* Expecting comment. - FM */ + if (historical_comments) { + /* + ** Any '>' terminates. 
- FM + */ + if (c == '>') { + HTChunkTerminate(string); + handle_comment(context); + string->size = 0; + context->end_comment = FALSE; + context->first_dash = FALSE; + context->state = S_text; + break; + } + HTChunkPutc(string, c); + break; + } + if (!context->first_dash && c == '-') { + HTChunkPutc(string, c); + context->first_dash = TRUE; + break; + } + if (context->first_dash && c == '-') { + HTChunkPutc(string, c); + context->first_dash = FALSE; + if (!context->end_comment) + context->end_comment = TRUE; + else if (!minimal_comments) + /* + ** Validly treat '--' pairs as successive comments + ** (for minimal, any "--WHITE>" terminates). - FM + */ + context->end_comment = FALSE; + break; + } + if (context->end_comment && c == '>') { + /* + ** Terminate and handle the comment. - FM + */ + HTChunkTerminate(string); + handle_comment(context); + string->size = 0; + context->end_comment = FALSE; + context->first_dash = FALSE; + context->state = S_text; + break; + } + context->first_dash = FALSE; + if (context->end_comment && !isspace(c)) + context->end_comment = FALSE; + HTChunkPutc(string, c); + break; + + case S_doctype: /* Expecting DOCTYPE. - FM */ + if (context->doctype_bracket) { + HTChunkPutc(string, c); + if (c == ']') + context->doctype_bracket = FALSE; + break; + } + if (c == '[' && WHITE(string->data[string->size - 1])) { + HTChunkPutc(string, c); + context->doctype_bracket = TRUE; + break; + } + if (c == '>') { + HTChunkTerminate(string); + handle_doctype(context); + string->size = 0; + context->state = S_text; + break; + } + HTChunkPutc(string, c); + break; + + case S_marked: /* Expecting marked section. 
- FM */ + if (context->first_bracket && c == '[') { + HTChunkPutc(string, c); + context->first_bracket = FALSE; + context->second_bracket = TRUE; + break; + } + if (context->second_bracket && c == ']' && + string->data[string->size - 1] == ']') { + HTChunkPutc(string, c); + context->second_bracket = FALSE; + break; + } + if (!context->second_bracket && c == '>') { + HTChunkTerminate(string); + handle_marked(context); + string->size = 0; + context->state = S_text; + break; + } + HTChunkPutc(string, c); + break; + + case S_sgmlent: /* Expecting ENTITY. - FM */ + if (!context->first_dash && c == '-') { + HTChunkPutc(string, c); + context->first_dash = TRUE; + break; + } + if (context->first_dash && c == '-') { + HTChunkPutc(string, c); + context->first_dash = FALSE; + if (!context->end_comment) + context->end_comment = TRUE; + else + context->end_comment = FALSE; + break; + } + if (context->end_comment && c == '>') { + HTChunkTerminate(string); + handle_sgmlent(context); + string->size = 0; + context->end_comment = FALSE; + context->first_dash = FALSE; + context->state = S_text; + break; + } + context->first_dash = FALSE; + HTChunkPutc(string, c); + break; + + case S_sgmlele: /* Expecting ELEMENT. - FM */ + if (!context->first_dash && c == '-') { + HTChunkPutc(string, c); + context->first_dash = TRUE; + break; + } + if (context->first_dash && c == '-') { + HTChunkPutc(string, c); + context->first_dash = FALSE; + if (!context->end_comment) + context->end_comment = TRUE; + else + context->end_comment = FALSE; + break; + } + if (context->end_comment && c == '>') { + HTChunkTerminate(string); + handle_sgmlele(context); + string->size = 0; + context->end_comment = FALSE; + context->first_dash = FALSE; + context->state = S_text; + break; + } + context->first_dash = FALSE; + HTChunkPutc(string, c); + break; + + case S_sgmlatt: /* Expecting ATTLIST. 
- FM */ + if (!context->first_dash && c == '-') { + HTChunkPutc(string, c); + context->first_dash = TRUE; + break; + } + if (context->first_dash && c == '-') { + HTChunkPutc(string, c); + context->first_dash = FALSE; + if (!context->end_comment) + context->end_comment = TRUE; + else + context->end_comment = FALSE; + break; + } + if (context->end_comment && c == '>') { + HTChunkTerminate(string); + handle_sgmlatt(context); + string->size = 0; + context->end_comment = FALSE; + context->first_dash = FALSE; + context->state = S_text; + break; + } + context->first_dash = FALSE; + HTChunkPutc(string, c); + break; + + case S_tag_gap: /* Expecting attribute or '>' */ + if (WHITE(c)) + break; /* Gap between attributes */ + if (c == '>') { /* End of tag */ + if (context->current_tag->name) + start_element(context); + context->state = S_text; + break; + } + HTChunkPutc(string, c); + context->state = S_attr; /* Get attribute */ + break; + + /* accumulating value */ + case S_attr: + if (WHITE(c) || (c == '>') || (c == '=')) { /* End of word */ + HTChunkTerminate(string); + handle_attribute_name(context, string->data); + string->size = 0; + if (c == '>') { /* End of tag */ + if (context->current_tag->name) + start_element(context); + context->state = S_text; + break; + } + context->state = (c == '=' ? 
S_equals: S_attr_gap); + } else { + HTChunkPutc(string, c); + } + break; + + case S_attr_gap: /* Expecting attribute or '=' or '>' */ + if (WHITE(c)) + break; /* Gap after attribute */ + if (c == '>') { /* End of tag */ + if (context->current_tag->name) + start_element(context); + context->state = S_text; + break; + } else if (c == '=') { + context->state = S_equals; + break; + } + HTChunkPutc(string, c); + context->state = S_attr; /* Get next attribute */ + break; + + case S_equals: /* After attr = */ + if (WHITE(c)) + break; /* Before attribute value */ + if (c == '>') { /* End of tag */ + if (TRACE) + fprintf(stderr, "SGML: found = but no value\n"); + if (context->current_tag->name) + start_element(context); + context->state = S_text; + break; + + } else if (c == '\'') { + context->state = S_squoted; + break; + + } else if (c == '"') { + context->state = S_dquoted; + break; + } + HTChunkPutc(string, c); + context->state = S_value; + break; + + case S_value: + if (WHITE(c) || (c == '>')) { /* End of word */ + HTChunkTerminate(string) ; + handle_attribute_value(context, string->data); + string->size = 0; + if (c == '>') { /* End of tag */ + if (context->current_tag->name) + start_element(context); + context->state = S_text; + break; + } + else context->state = S_tag_gap; + } else { + HTChunkPutc(string, c); + } + break; + + case S_squoted: /* Quoted attribute value */ + if (c == '\'') { /* End of attribute value */ + HTChunkTerminate(string) ; + handle_attribute_value(context, string->data); + string->size = 0; + context->state = S_tag_gap; + } else if (c == '\033') { + /* + ** Setting up for possible single quotes in CJK escape + ** sequences. 
- Takuya ASADA (asada@three-a.co.jp) + */ + context->state = S_esc_sq; + HTChunkPutc(string, c); + } else { + HTChunkPutc(string, c); + } + break; + + case S_dquoted: /* Quoted attribute value */ + if (c == '"' || /* Valid end of attribute value */ + (soft_dquotes && /* If emulating old Netscape bug, treat '>' */ + c == '>')) { /* as a co-terminator of dquoted and tag */ + HTChunkTerminate(string) ; + handle_attribute_value(context, string->data); + string->size = 0; + context->state = S_tag_gap; + if (c == '>') /* We emulated the Netscape bug, so we go */ + goto top; /* back and treat it as the tag terminator */ + } else if (c == '\033') { + /* + ** Setting up for possible double quotes in CJK escape + ** sequences. - Takuya ASADA (asada@three-a.co.jp) + */ + context->state = S_esc_dq; + HTChunkPutc(string, c); + } else { + HTChunkPutc(string, c); + } + break; + + case S_end: /* </ */ + if ((unsigned char)c < 127 && isalnum((unsigned char)c)) + HTChunkPutc(string, c); + else { /* End of end tag name */ + HTTag * t=0; + HTChunkTerminate(string) ; + if (!*string->data) { /* Empty end tag */ + if (context->element_stack) + t = context->element_stack->tag; + } else { + t = SGMLFindTag(dtd, string->data); + } + if (!t) { + if (TRACE) + fprintf(stderr, "Unknown end tag </%s>\n", string->data); + } else { + BOOL tag_OK = (c == '>' || WHITE(c)); + context->current_tag = t; + if (tag_OK && + (!strcasecomp(string->data, "DD") || + !strcasecomp(string->data, "DT") || + !strcasecomp(string->data, "LI") || + !strcasecomp(string->data, "LH") || + !strcasecomp(string->data, "TD") || + !strcasecomp(string->data, "TH") || + !strcasecomp(string->data, "TR") || + !strcasecomp(string->data, "THEAD") || + !strcasecomp(string->data, "TFOOT") || + !strcasecomp(string->data, "TBODY") || + !strcasecomp(string->data, "COLGROUP"))) { + /* + ** Don't treat these end tags as invalid, + ** nor act on them. - FM + */ + if (TRACE) + fprintf(stderr, + "SGML: `</%s%c' found! 
Ignoring it.\n", + string->data, c); + string->size = 0; + context->current_attribute_number = INVALID; + if (c != '>') { + context->state = S_junk_tag; + } else { + context->state = S_text; + } + break; + } else if (tag_OK && + !strcasecomp(string->data, "P")) { + /* + ** Treat a P end tag like a P start tag (Ugh, + ** what a hack! 8-). - FM + */ + if (TRACE) + fprintf(stderr, + "SGML: `</%s%c' found! Treating as '<%s%c'.\n", + string->data, c, string->data, c); + { + int i; + for (i = 0; + i < context->current_tag->number_of_attributes; + i++) { + context->present[i] = NO; + } + } + string->size = 0; + context->current_attribute_number = INVALID; + if (context->current_tag->name) + start_element(context); + if (c != '>') { + context->state = S_junk_tag; + } else { + context->state = S_text; + } + break; + } else if (tag_OK && + !strcasecomp(string->data, "FONT")) { + /* + ** Treat a FONT end tag as a FONT start tag with + ** a dummy END attribute. It's too likely to be + ** interdigited and mess up the parsing, so we've + ** declared FONT as SGML_EMPTY and will handle the + ** end tag in HTML_start_element. - FM + */ + if (TRACE) + fprintf(stderr, + "SGML: `</%s%c' found! Treating as '<%s%c'.\n", + string->data, c, string->data, c); + { + int i; + for (i = 0; + i < context->current_tag->number_of_attributes; + i++) { + context->present[i] = (i == HTML_FONT_END); + } + } + string->size = 0; + context->current_attribute_number = INVALID; + if (context->current_tag->name) + start_element(context); + if (c != '>') { + context->state = S_junk_tag; + } else { + context->state = S_text; + } + break; + } else { + /* + ** Handle all other end tags normally. 
- FM + */ + end_element( context, context->current_tag); + } + } + + string->size = 0; + context->current_attribute_number = INVALID; + if (c != '>') { + if (TRACE && !WHITE(c)) + fprintf(stderr,"SGML: `</%s%c' found!\n", string->data, c); + context->state = S_junk_tag; + } else { + context->state = S_text; + } + } + break; + + + case S_esc: /* Expecting '$'or '(' following CJK ESC. */ + if (c == '$') { + context->state = S_dollar; + } else if (c == '(') { + context->state = S_paren; + } else { + context->state = S_text; + } + PUTC(c); + break; + + case S_dollar: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ + if (c == '@' || c == 'B' || c == 'A') { + context->state = S_nonascii_text; + } else if (c == '(') { + context->state = S_dollar_paren; + } + PUTC(c); + break; + + case S_dollar_paren: /* Expecting 'C' after CJK "ESC$(". */ + if (c == 'C') { + context->state = S_nonascii_text; + } else { + context->state = S_text; + } + PUTC(c); + break; + + case S_paren: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ + if (c == 'B' || c == 'J' || c == 'T') { + context->state = S_text; + } else if (c == 'I') { + context->state = S_nonascii_text; + } else { + context->state = S_text; + } + PUTC(c); + break; + + case S_nonascii_text: /* Expecting CJK ESC after non-ASCII text. */ + if (c == '\033') { + context->state = S_esc; + } + PUTC(c); + break; + + case S_esc_sq: /* Expecting '$'or '(' following CJK ESC. */ + if (c == '$') { + context->state = S_dollar_sq; + } else if (c == '(') { + context->state = S_paren_sq; + } else { + context->state = S_squoted; + } + HTChunkPutc(string, c); + break; + + case S_dollar_sq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ + if (c == '@' || c == 'B' || c == 'A') { + context->state = S_nonascii_text_sq; + } else if (c == '(') { + context->state = S_dollar_paren_sq; + } + HTChunkPutc(string, c); + break; + + case S_dollar_paren_sq: /* Expecting 'C' after CJK "ESC$(". 
*/ + if (c == 'C') { + context->state = S_nonascii_text_sq; + } else { + context->state = S_squoted; + } + HTChunkPutc(string, c); + break; + + case S_paren_sq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ + if (c == 'B' || c == 'J' || c == 'T') { + context->state = S_squoted; + } else if (c == 'I') { + context->state = S_nonascii_text_sq; + } else { + context->state = S_squoted; + } + HTChunkPutc(string, c); + break; + + case S_nonascii_text_sq: /* Expecting CJK ESC after non-ASCII text. */ + if (c == '\033') { + context->state = S_esc_sq; + } + HTChunkPutc(string, c); + break; + + case S_esc_dq: /* Expecting '$'or '(' following CJK ESC. */ + if (c == '$') { + context->state = S_dollar_dq; + } else if (c == '(') { + context->state = S_paren_dq; + } else { + context->state = S_dquoted; + } + HTChunkPutc(string, c); + break; + + case S_dollar_dq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */ + if (c == '@' || c == 'B' || c == 'A') { + context->state = S_nonascii_text_dq; + } else if (c == '(') { + context->state = S_dollar_paren_dq; + } + HTChunkPutc(string, c); + break; + + case S_dollar_paren_dq: /* Expecting 'C' after CJK "ESC$(". */ + if (c == 'C') { + context->state = S_nonascii_text_dq; + } else { + context->state = S_dquoted; + } + HTChunkPutc(string, c); + break; + + case S_paren_dq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */ + if (c == 'B' || c == 'J' || c == 'T') { + context->state = S_dquoted; + } else if (c == 'I') { + context->state = S_nonascii_text_dq; + } else { + context->state = S_dquoted; + } + HTChunkPutc(string, c); + break; + + case S_nonascii_text_dq: /* Expecting CJK ESC after non-ASCII text. */ + if (c == '\033') { + context->state = S_esc_dq; + } + HTChunkPutc(string, c); + break; + + case S_junk_tag: + if (c == '>') { + context->state = S_text; + } + } /* switch on context->state */ + + /* + ** Check whether we've added anything to the recover buffer. 
- FM + */ + if (context->recover != NULL) { + if (context->recover[context->recover_index] == '\0') { + FREE(context->recover); + context->recover_index = 0; + } else { + c = context->recover[context->recover_index]; + context->recover_index++; + goto top; + } + } + + /* + ** Check whether an external function has added + ** anything to the include buffer. - FM + */ + if (context->include != NULL) { + if (context->include[context->include_index] == '\0') { + FREE(context->include); + context->include_index = 0; + } else { + c = context->include[context->include_index]; + context->include_index++; + goto top; + } + } + + /* + ** Check whether an external function has added + ** anything to the csi buffer. - FM + */ + if (context->csi != NULL) { + if (context->csi[context->csi_index] == '\0') { + FREE(context->csi); + context->csi_index = 0; + } else { + c = context->csi[context->csi_index]; + context->csi_index++; + goto top; + } + } +} /* SGML_character */ + + +PUBLIC void SGML_string ARGS2( + HTStream *, context, + CONST char*, str) +{ + CONST char *p; + for (p = str; *p; p++) + SGML_character(context, *p); +} + + +PUBLIC void SGML_write ARGS3( + HTStream *, context, + CONST char*, str, + int, l) +{ + CONST char *p; + CONST char *e = str+l; + for (p = str; p < e; p++) + SGML_character(context, *p); +} + +/*_______________________________________________________________________ +*/ + +/* Structured Object Class +** ----------------------- +*/ +PUBLIC CONST HTStreamClass SGMLParser = +{ + "SGMLParser", + SGML_free, + SGML_abort, + SGML_character, + SGML_string, + SGML_write, +}; + +/* Create SGML Engine +** ------------------ +** +** On entry, +** dtd represents the DTD, along with +** actions is the sink for the data as a set of routines. 
+** +*/ + +PUBLIC HTStream* SGML_new ARGS2( + CONST SGML_dtd *, dtd, + HTStructured *, target) +{ + int i; + HTStream* context = (HTStream *) malloc(sizeof(*context)); + if (!context) + outofmem(__FILE__, "SGML_begin"); + + context->isa = &SGMLParser; + context->string = HTChunkCreate(128); /* Grow by this much */ + context->dtd = dtd; + context->target = target; + context->actions = (HTStructuredClass*)(((HTStream*)target)->isa); + /* Ugh: no OO */ + context->state = S_text; + context->element_stack = 0; /* empty */ +#ifdef CALLERDATA + context->callerData = (void*) callerData; +#endif /* CALLERDATA */ + for (i = 0; i < MAX_ATTRIBUTES; i++) + context->value[i] = 0; + + context->lead_exclamation = FALSE; + context->first_dash = FALSE; + context->end_comment = FALSE; + context->doctype_bracket = FALSE; + context->first_bracket = FALSE; + context->second_bracket = FALSE; + context->recover = NULL; + + context->recover_index = 0; + context->include = NULL; + context->include_index = 0; + context->url = NULL; + context->csi = NULL; + context->csi_index = 0; + + return context; +} + +/* Asian character conversion functions +** ==================================== +** +** Added 24-Mar-96 by FM, based on: +** +//////////////////////////////////////////////////////////////////////// +Copyright (c) 1993 Electrotechnical Laboratry (ETL) + +Permission to use, copy, modify, and distribute this material +for any purpose and without fee is hereby granted, provided +that the above copyright notice and this permission notice +appear in all copies, and that the name of ETL not be +used in advertising or publicity pertaining to this +material without the specific, prior written permission +of an authorized representative of ETL. +ETL MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY +OF THIS MATERIAL FOR ANY PURPOSE. IT IS PROVIDED "AS IS", +WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES. 
+///////////////////////////////////////////////////////////////////////// +Content-Type: program/C; charset=US-ASCII +Program: SJIS.c +Author: Yutaka Sato <ysato@etl.go.jp> +Description: +History: + 930923 extracted from codeconv.c of cosmos +/////////////////////////////////////////////////////////////////////// +*/ + +PUBLIC int TREAT_SJIS = 1; + +PUBLIC void JISx0201TO0208_EUC ARGS4( + register unsigned char, IHI, + register unsigned char, ILO, + register unsigned char *, OHI, + register unsigned char *, OLO) +{ + static char *table[] = { + "\xA1\xA3", "\xA1\xD6", "\xA1\xD7", "\xA1\xA2", "\xA1\xA6", "\xA5\xF2", + "\xA5\xA1", "\xA5\xA3", "\xA5\xA5", "\xA5\xA7", "\xA5\xA9", + "\xA5\xE3", "\xA5\xE5", "\xA5\xE7", "\xA5\xC3", "\xA1\xBC", + "\xA5\xA2", "\xA5\xA4", "\xA5\xA6", "\xA5\xA8", "\xA5\xAA", + "\xA5\xAB", "\xA5\xAD", "\xA5\xAF", "\xA5\xB1", "\xA5\xB3", + "\xA5\xB5", "\xA5\xB7", "\xA5\xB9", "\xA5\xBB", "\xA5\xBD", + "\xA5\xBF", "\xA5\xC1", "\xA5\xC4", "\xA5\xC6", "\xA5\xC8", + "\xA5\xCA", "\xA5\xCB", "\xA5\xCC", "\xA5\xCD", "\xA5\xCE", + "\xA5\xCF", "\xA5\xD2", "\xA5\xD5", "\xA5\xD8", "\xA5\xDB", + "\xA5\xDE", "\xA5\xDF", "\xA5\xE0", "\xA5\xE1", "\xA5\xE2", + "\xA5\xE4", "\xA5\xE6", "\xA5\xE8", "\xA5\xE9", "\xA5\xEA", + "\xA5\xEB", "\xA5\xEC", "\xA5\xED", "\xA5\xEF", "\xA5\xF3", + "\xA1\xAB", "\xA1\xAC" + }; + + if ((IHI == 0x8E) && (ILO >= 0xA1) && (ILO <= 0xDF)) { + *OHI = table[ILO - 0xA1][0]; + *OLO = table[ILO - 0xA1][1]; + } else { + *OHI = IHI; + *OLO = ILO; + } +} + +PUBLIC unsigned char * SJIS_TO_JIS1 ARGS3( + register unsigned char, HI, + register unsigned char, LO, + register unsigned char *, JCODE) +{ + HI -= (HI <= 0x9F) ? 
0x71 : 0xB1; + HI = (HI << 1) + 1; + if (0x7F < LO) + LO--; + if (0x9E <= LO) { + LO -= 0x7D; + HI++; + } else { + LO -= 0x1F; + } + JCODE[0] = HI; + JCODE[1] = LO; + return JCODE; +} + +PUBLIC unsigned char * JIS_TO_SJIS1 ARGS3( + register unsigned char, HI, + register unsigned char, LO, + register unsigned char *, SJCODE) +{ + if (HI & 1) + LO += 0x1F; + else + LO += 0x7D; + if (0x7F <= LO) + LO++; + + HI = ((HI - 0x21) >> 1) + 0x81; + if (0x9F < HI) + HI += 0x40; + SJCODE[0] = HI; + SJCODE[1] = LO; + return SJCODE; +} + +PUBLIC unsigned char * EUC_TO_SJIS1 ARGS3( + unsigned char, HI, + unsigned char, LO, + register unsigned char *, SJCODE) +{ + if (HI == 0x8E) JISx0201TO0208_EUC(HI, LO, &HI, &LO); + JIS_TO_SJIS1(HI&0x7F, LO&0x7F, SJCODE); + return SJCODE; +} + +PUBLIC void JISx0201TO0208_SJIS ARGS3( + register unsigned char, I, + register unsigned char *, OHI, + register unsigned char *, OLO) +{ + unsigned char SJCODE[2]; + + JISx0201TO0208_EUC('\x8E', I, OHI, OLO); + JIS_TO_SJIS1(*OHI&0x7F, *OLO&0x7F, SJCODE); + *OHI = SJCODE[0]; + *OLO = SJCODE[1]; +} + +PUBLIC unsigned char * SJIS_TO_EUC1 ARGS3( + unsigned char, HI, + unsigned char, LO, + unsigned char *, EUC) +{ + SJIS_TO_JIS1(HI, LO, EUC); + EUC[0] |= 0x80; + EUC[1] |= 0x80; + return EUC; +} + +PUBLIC unsigned char * SJIS_TO_EUC ARGS2( + unsigned char *, src, + unsigned char *, dst) +{ + register unsigned char hi, lo, *sp, *dp; + register int in_sjis = 0; + + for (sp = src, dp = dst; (0 != (hi = sp[0]));) { + lo = sp[1]; + if (TREAT_SJIS && IS_SJIS(hi, lo, in_sjis)) { + SJIS_TO_JIS1(hi,lo,dp); + dp[0] |= 0x80; + dp[1] |= 0x80; + dp += 2; + sp += 2; + } else { + *dp++ = *sp++; + } + } + *dp = 0; + return dst; +} + +PUBLIC unsigned char * EUC_TO_SJIS ARGS2( + unsigned char *, src, + unsigned char *, dst) +{ + register unsigned char *sp, *dp; + + for (sp = src, dp = dst; *sp;) { + if (*sp & 0x80) { + if (sp[1] && (sp[1] & 0x80)) { + JIS_TO_SJIS1(sp[0]&0x7F, sp[1]&0x7F, dp); + dp += 2; + sp += 2; + } else { + 
sp++; + } + } else { + *dp++ = *sp++; + } + } + *dp = 0; + return dst; +} + +PUBLIC unsigned char * EUC_TO_JIS ARGS4( + unsigned char *, src, + unsigned char *, dst, + CONST char *, toK, + CONST char *, toA) +{ + register unsigned char kana_mode = 0; + register unsigned char cch; + register unsigned char *sp = src; + register unsigned char *dp = dst; + register int i; + + while (0 != (cch = *sp++)) { + if (cch & 0x80) { + if (!kana_mode) { + kana_mode = ~kana_mode; + for (i = 0; toK[i]; i++) { + *dp++ = (unsigned char)toK[i]; + } + } + if (*sp & 0x80) { + *dp++ = cch & ~0x80; + *dp++ = *sp++ & ~0x80; + } + } else { + if (kana_mode) { + kana_mode = ~kana_mode; + for (i = 0; toA[i]; i++) { + *dp++ = (unsigned char)toA[i]; + *dp = '\0'; + } + } + *dp++ = cch; + } + } + if (kana_mode) { + for (i = 0; toA[i]; i++) { + *dp++ = (unsigned char)toA[i]; + } + } + + if (dp) + *dp = 0; + return dst; +} + +PUBLIC unsigned char * TO_EUC ARGS2( + unsigned char *, jis, + unsigned char *, euc) +{ + register unsigned char *s, *d, c, jis_stat; + register to1B, to2B; + register int in_sjis = 0; + + s = jis; + d = euc; + jis_stat = 0; + to2B = TO_2BCODE; + to1B = TO_1BCODE; + + while (0 != (c = *s++)) { + if (c == ESC) { + if (*s == to2B) { + if ((s[1] == 'B') || (s[1] == '@') || (s[1] == 'A')) { + jis_stat = 0x80; + s += 2; + continue; + } else if ((s[1] == '(') && s[2] && (s[2] == 'C')) { + jis_stat = 0x80; + s += 3; + continue; + } + } else { + if (*s == to1B) { + if ((s[1]=='B') || (s[1]=='J') || + (s[1]=='H') || (s[1]=='T')) { + jis_stat = 0; + s += 2; + continue; + } + } + } + } + if (IS_SJIS(c,*s,in_sjis)) { + SJIS_TO_EUC1(c, *s, d); + d += 2; + s++; + } else { + if (jis_stat && (0x20 < c)) { + *d++ = jis_stat | c; + } else { + *d++ = c; + } + } + } + *d = 0; + return euc; +} + +PUBLIC void TO_SJIS ARGS2( + unsigned char *, any, + unsigned char *, sjis) +{ + unsigned char *euc; + + if (!any || !sjis) + return; + + euc = (unsigned char*)malloc(strlen((CONST char *)any)+1); + if 
(euc == NULL) + outofmem(__FILE__, "TO_SJIS"); + + TO_EUC(any, euc); + EUC_TO_SJIS(euc, sjis); + FREE(euc); +} + +PUBLIC void TO_JIS ARGS2( + unsigned char *, any, + unsigned char *, jis) +{ + unsigned char *euc; + + if (!any || !jis) + return; + + euc = (unsigned char*)malloc(strlen((CONST char *)any)+1); + if (euc == NULL) + outofmem(__FILE__, "TO_JIS"); + + TO_EUC(any, euc); + EUC_TO_JIS(euc, jis, TO_KANJI, TO_ASCII); + FREE(euc); +} |