/* HTML to rich text converter for libwww * * THE HTML TO RTF OBJECT CONVERTER * * This interprets the HTML semantics. */ #ifndef HTML_H #define HTML_H #ifndef HTUTILS_H #include #endif /* HTUTILS_H */ #include #include #include #include /* #define ATTR_CS_IN (me->T.output_utf8 ? me->UCLYhndl : 0) */ #define ATTR_CS_IN me->tag_charset #define TRANSLATE_AND_UNESCAPE_ENTITIES(s, p, h) \ LYUCTranslateHTMLString(s, ATTR_CS_IN, current_char_set, YES, p, h, st_HTML) #define TRANSLATE_AND_UNESCAPE_ENTITIES5(s,cs_from,cs_to,p,h) \ LYUCTranslateHTMLString(s, cs_from, cs_to, YES, p, h, st_HTML) #define TRANSLATE_AND_UNESCAPE_ENTITIES6(s,cs_from,cs_to,spcls,p,h) \ LYUCTranslateHTMLString(s, cs_from, cs_to, spcls, p, h, st_HTML) #define TRANSLATE_HTML(s,p,h) \ LYUCFullyTranslateString(s, me->UCLYhndl, current_char_set, NO, YES, p, h, NO, st_HTML) #define TRANSLATE_HTML5(s,cs_from,cs_to,p,h) \ LYUCFullyTranslateString(s, cs_from, cs_to, NO, YES, p, h, NO, st_HTML) #define TRANSLATE_HTML7(s,cs_from,cs_to,spcls,p,h,Back) \ LYUCFullyTranslateString(s, cs_from, cs_to, NO, spcls, p, h, Back, st_HTML) /* * Strings from attributes which should be converted to some kind of "standard" * representation (character encoding), was Latin-1, esp. URLs (incl. * #fragments) and HTML NAME and ID stuff. */ #define TRANSLATE_AND_UNESCAPE_TO_STD(s) \ LYUCTranslateHTMLString(s, ATTR_CS_IN, ATTR_CS_IN, NO, NO, YES, st_URL) #define UNESCAPE_FIELDNAME_TO_STD(s) \ LYUCTranslateHTMLString(s, ATTR_CS_IN, ATTR_CS_IN, NO, NO, YES, st_HTML) extern const HTStructuredClass HTMLPresentation; #ifdef Lynx_HTML_Handler /* * This section is semi-private to HTML.c and it's helper modules. - FM * -------------------------------------------------------------------- */ typedef struct _stack_element { HTStyle *style; int tag_number; } stack_element; /* HTML Object * ----------- */ #define MAX_NESTING 800 /* Should be checked by parser */ struct _HTStructured { const HTStructuredClass *isa; HTParentAnchor *node_anchor; HText *text; HTStream *target; /* Output stream */ HTStreamClass targetClass; /* Output routines */ HTChildAnchor *CurrentA; /* current HTML_A anchor */ int CurrentANum; /* current HTML_A number */ char *base_href; /* current HTML_BASE href */ char *map_address; /* current HTML_MAP address */ HTChunk title; /* Grow by 128 */ HTChunk object; /* Grow by 128 */ BOOL object_started; BOOL object_declare; BOOL object_shapes; BOOL object_ismap; char *object_usemap; char *object_id; char *object_title; char *object_data; char *object_type; char *object_classid; char *object_codebase; char *object_codetype; char *object_name; int objects_mixed_open, objects_figged_open; HTChunk option; /* Grow by 128 */ BOOL first_option; /* First OPTION in SELECT? */ char *LastOptionValue; BOOL LastOptionChecked; BOOL select_disabled; HTChunk textarea; /* Grow by 128 */ char *textarea_name; int textarea_name_cs; char *textarea_accept_cs; char *textarea_cols; int textarea_rows; int textarea_disabled; char *textarea_id; HTChunk math; /* Grow by 128 */ HTChunk style_block; /* Grow by 128 */ HTChunk script; /* Grow by 128 */ /* * Used for nested lists. - FM */ int List_Nesting_Level; /* counter for list nesting level */ int OL_Counter[12]; /* counter for ordered lists */ char OL_Type[12]; /* types for ordered lists */ int Last_OL_Count; /* last count in ordered lists */ char Last_OL_Type; /* last type in ordered lists */ int Division_Level; short DivisionAlignments[MAX_NESTING]; int Underline_Level; int Quote_Level; BOOL UsePlainSpace; BOOL HiddenValue; int lastraw; char *comment_start; /* for literate programming */ char *comment_end; HTTag *current_tag; BOOL style_change; HTStyle *new_style; HTStyle *old_style; int current_default_alignment; BOOL in_word; /* Have just had a non-white char */ stack_element stack[MAX_NESTING]; stack_element *sp; /* Style stack pointer */ BOOL stack_overrun; /* Was MAX_NESTING exceeded? */ int skip_stack; /* flag to skip next style stack operation */ /* * Track if we are in an anchor, paragraph, address, base, etc. */ BOOL inA; BOOL inAPPLET; BOOL inAPPLETwithP; BOOL inBadBASE; BOOL inBadHREF; BOOL inBadHTML; BOOL inBASE; BOOL inBoldA; BOOL inBoldH; BOOL inCAPTION; BOOL inCREDIT; BOOL inFIG; BOOL inFIGwithP; BOOL inFONT; BOOL inFORM; BOOL inLABEL; BOOL inP; BOOL inPRE; BOOL inSELECT; BOOL inTABLE; BOOL inTEXTAREA; BOOL inUnderline; BOOL needBoldH; char *xinclude; /* if no include strin address passed */ /* * UCI and UCLYhndl give the UCInfo and charset registered for the HTML * parser in the node_anchor's UCStages structure. It indicates what is * fed to the HTML parser as the stream of character data (not necessarily * tags and attributes). It should currently always be set to be the same * as UCI and UCLhndl for the HTEXT stage in the node_anchor's UCStages * structure, since the HTML parser sends its input character data to the * output without further charset translation. */ LYUCcharset *UCI; int UCLYhndl; /* * inUCI and inUCLYhndl indicate the UCInfo and charset which the HTML * parser treats at the input charset. It is normally set to the UCI and * UCLhndl for the SGML parser in the node_anchor's UCStages structure * (which may be a dummy, based on the MIME parser's UCI and UCLhndl in * that structure, when we are handling a local file or non-http(s) * gateway). It could be changed temporarily by the HTML parser, for * conversions of attribute strings, but should be reset once done. - FM */ LYUCcharset *inUCI; int inUCLYhndl; /* * outUCI and outUCLYhndl indicate the UCInfo and charset which the HTML * parser treats as the output charset. It is normally set to its own UCI * and UCLhndl. It could be changed for conversions of attribute strings, * but should be reset once done. - FM */ LYUCcharset *outUCI; int outUCLYhndl; /* * T holds the transformation rules for conversions of strings between the * input and output charsets by the HTML parser. - FM */ UCTransParams T; int tag_charset; /* charset for attribute values etc. */ }; extern HTStyle *LYstyles(int style_number); extern BOOL LYBadHTML(HTStructured * me); /* * Semi-Private functions. - FM */ extern void HTML_put_character(HTStructured * me, char c); extern void HTML_put_string(HTStructured * me, const char *s); extern void HTML_write(HTStructured * me, const char *s, int l); extern int HTML_put_entity(HTStructured * me, int entity_number); extern void actually_set_style(HTStructured * me); /* Style buffering avoids dummy paragraph begin/ends. */ #define UPDATE_STYLE if (me->style_change) { actually_set_style(me); } #endif /* Lynx_HTML_Handler */ extern void strtolower(char *i); /* P U B L I C */ /* * HTConverter to present HTML */ extern HTStream *HTMLToPlain(HTPresentation *pres, HTParentAnchor *anchor, HTStream *sink); extern HTStream *HTMLParsedPresent(HTPresentation *pres, HTParentAnchor *anchor, HTStream *sink); extern HTStream *HTMLToC(HTPresentation *pres, HTParentAnchor *anchor, HTStream *sink); extern HTStream *HTMLPresent(HTPresentation *pres, HTParentAnchor *anchor, HTStream *sink); extern HTStructured *HTML_new(HTParentAnchor *anchor, HTFormat format_out, HTStream *target); /* * Record error message as a hypertext object. * * The error message should be marked as an error so that it can be reloaded * later. This implementation just throws up an error message and leaves the * document unloaded. * * On entry, * sink is a stream to the output device if any * number is the HTTP error number * message is the human readable message. * On exit, * a return code like HT_LOADED if object exists else 60; 0 */ extern int HTLoadError(HTStream *sink, int number, const char *message); #endif /* HTML_H */