#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2010 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
## parser.
## The only encoding that is supported is UTF-8. The parser has been designed
## to be somewhat error correcting, so that even most "wild HTML" found on the
## web can be parsed with it. **Note:** This parser does not check that each
## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
## implemented by the client code for various reasons:
##
## * Old HTML contains tags that have no end tag: ``<br>`` for example.
## * HTML tags are case insensitive, XML tags are case sensitive. Since this
##   library can parse both, only the client knows which comparison is to be
##   used.
## * Thus the checks would have been very difficult to implement properly with
##   little benefit, especially since they are simple to implement in the
##   client. The client should use the `errorMsgExpected` proc to generate
##   a nice error message that fits the other error messages this library
##   creates.
##
##
## Example 1: Retrieve HTML title
## ==============================
##
## The file ``examples/htmltitle.nim`` demonstrates how to use the
## XML parser to accomplish a simple task: To determine the title of an HTML
## document.
##
## .. code-block:: nimrod
##     :file: examples/htmltitle.nim
##
##
## Example 2: Retrieve all HTML links
## ==================================
##
## The file ``examples/htmlrefs.nim`` demonstrates how to use the
## XML parser to accomplish another simple task: To determine all the links
## an HTML document contains.
##
## .. code-block:: nimrod
##     :file: examples/htmlrefs.nim
##

import hashes, strutils, lexbase, streams, unicode

# the parser treats ``<br />`` as ``<br></br>``

#  xmlElementCloseEnd, ## ``/>``

type
  TXmlEventKind* = enum ## enumeration of all events that may occur when parsing
    xmlError,           ## an error occurred during parsing
    xmlEof,             ## end of file reached
    xmlCharData,        ## character data
    xmlWhitespace,      ## whitespace has been parsed
    xmlComment,         ## a comment has been parsed
    xmlPI,              ## processing instruction (``<?name rest?>``)
    xmlElementStart,    ## ``<elem>``
    xmlElementEnd,      ## ``</elem>``
    xmlElementOpen,     ## ``<elem``
    xmlAttribute,       ## ``key = "value"`` pair
    xmlElementClose,    ## ``>``
    xmlCData,           ## ``<![CDATA[ ... data ... ]]>``
    xmlEntity,          ## &entity;
    xmlSpecial          ## ``<! ... data ... >``

  TXmlError* = enum          ## enumeration that lists all errors that can occur
    errNone,                 ## no error
    errEndOfCDataExpected,   ## ``]]>`` expected
    errNameExpected,         ## name expected
    errSemicolonExpected,    ## ``;`` expected
    errQmGtExpected,         ## ``?>`` expected
    errGtExpected,           ## ``>`` expected
    errEqExpected,           ## ``=`` expected
    errQuoteExpected,        ## ``"`` or ``'`` expected
    errEndOfCommentExpected  ## ``-->`` expected

  TParserState = enum
    stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError

  TXmlParseOption* = enum  ## options for the XML parser
    reportWhitespace,      ## report whitespace
    reportComments         ## report comments

  TXmlParser* = object of TBaseLexer ## the parser object.
    a, b, c: string        # `a`: text/name/key of the current event,
                           # `b`: attribute value or rest of a PI
    kind: TXmlEventKind    # kind of the current event
    err: TXmlError         # error code, set by `markError`
    state: TParserState
    filename: string       # only used for error messages
    options: set[TXmlParseOption]

const
  # one message per `TXmlError` value, in declaration order
  errorMessages: array[TXmlError, string] = [
    "no error",
    "']]>' expected",
    "name expected",
    "';' expected",
    "'?>' expected",
    "'>' expected",
    "'=' expected",
    "'\"' or \"'\" expected",
    "'-->' expected"
  ]

proc open*(my: var TXmlParser, input: PStream, filename: string,
           options: set[TXmlParseOption] = {}) =
  ## initializes the parser with an input stream. `Filename` is only used
  ## for nice error messages. The parser's behaviour can be controlled by
  ## the `options` parameter: If `options` contains ``reportWhitespace``
  ## a whitespace token is reported as an ``xmlWhitespace`` event.
  ## If `options` contains ``reportComments`` a comment token is reported as an
  ## ``xmlComment`` event.
  lexbase.open(my, input)
  my.filename = filename
  my.state = stateStart
  my.kind = xmlError
  # reset the error code as well so that a re-opened parser object does not
  # report a stale error from a previous run
  my.err = errNone
  my.a = ""
  my.b = ""
  my.options = options

proc close*(my: var TXmlParser) {.inline.} =
  ## closes the parser `my` and its associated input stream.
  lexbase.close(my)

proc charData*(my: TXmlParser): string {.inline.} =
  ## returns the character data for the events: ``xmlCharData``,
  ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
  assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
                     xmlSpecial})
  return my.a

proc kind*(my: TXmlParser): TXmlEventKind {.inline.} =
  ## returns the current event type for the XML parser
  return my.kind

proc elementName*(my: TXmlParser): string {.inline.} =
  ## returns the element name for the events: ``xmlElementStart``,
  ## ``xmlElementEnd``, ``xmlElementOpen``
  assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
  return my.a

proc entityName*(my: TXmlParser): string {.inline.} =
  ## returns the entity name for the event: ``xmlEntity``
  assert(my.kind == xmlEntity)
  return my.a

proc attrKey*(my: TXmlParser): string {.inline.} =
  ## returns the attribute key for the event ``xmlAttribute``
  assert(my.kind == xmlAttribute)
  return my.a

proc attrValue*(my: TXmlParser): string {.inline.} =
  ## returns the attribute value for the event ``xmlAttribute``
  assert(my.kind == xmlAttribute)
  return my.b

proc PIName*(my: TXmlParser): string {.inline.} =
  ## returns the processing instruction name for the event ``xmlPI``
  assert(my.kind == xmlPI)
  return my.a

proc PIRest*(my: TXmlParser): string {.inline.} =
  ## returns the rest of the processing instruction for the event ``xmlPI``
  assert(my.kind == xmlPI)
  return my.b

proc getColumn*(my: TXmlParser): int {.inline.} =
  ## get the current column the parser has arrived at.
  result = getColNumber(my, my.bufpos)

proc getLine*(my: TXmlParser): int {.inline.} =
  ## get the current line the parser has arrived at.
  result = my.linenumber

proc getFilename*(my: TXmlParser): string {.inline.} =
  ## get the filename of the file that the parser processes.
  result = my.filename

proc errorMsg*(my: TXmlParser, msg: string): string =
  ## returns an error message with text `msg` in the same format as the
  ## other error messages: ``filename(line, column) Error: msg``
  result = "$1($2, $3) Error: $4" % [
    my.filename, $getLine(my), $getColumn(my), msg]

proc errorMsg*(my: TXmlParser): string =
  ## returns a helpful error message for the event ``xmlError``
  assert(my.kind == xmlError)
  result = errorMsg(my, errorMessages[my.err])

proc errorMsgExpected*(my: TXmlParser, tag: string): string =
  ## returns an error message "<tag> expected" in the same format as the
  ## other error messages
  result = errorMsg(my, "<$1> expected" % tag)

proc markError(my: var TXmlParser, kind: TXmlError) {.inline.} =
  # records the error code and switches the parser into the error state
  my.err = kind
  my.state = stateError

proc parseCDATA(my: var TXmlParser) =
  # Appends the content of a CDATA section to `my.a` until ``]]>`` is seen;
  # a '\0' before that marks `errEndOfCDataExpected`.
  # NOTE(review): the text between `len("` and the `]]>` test was eaten by
  # markup stripping; the opening literal, the `buf` declaration and the
  # loop/case head are reconstructed from the surviving remainder -- confirm
  # against the upstream file.
  var pos = my.bufpos + len("<![CDATA[")
  var buf = my.buf
  while true:
    case buf[pos]
    of ']':
      if buf[pos+1] == ']' and buf[pos+2] == '>':
        inc(pos, 3)          # skip the terminating "]]>"
        break
      add(my.a, ']')
      inc(pos)
    of '\0':
      markError(my, errEndOfCDataExpected)
      break
    of '\c':
      pos = lexbase.HandleCR(my, pos)
      buf = my.buf           # re-read: the lexer buffer is reassigned here
      add(my.a, '\L')        # CR is stored as '\L'
    of '\L':
      pos = lexbase.HandleLF(my, pos)
      buf = my.buf
      add(my.a, '\L')
    else:
      add(my.a, buf[pos])
      inc(pos)
  my.bufpos = pos # store back
  my.kind = xmlCData

# NOTE(review): everything below is damaged beyond safe repair. The markup
# stripping removed the text from the comment-open literal inside
# `parseComment` up to the comment-close literal in the main-module test, so
# the body of `parseComment`, all following parser procs and the head of the
# test program are missing. The surviving text is kept verbatim; restore this
# part from the upstream source.
proc parseComment(my: var TXMLParser) = var pos = my.bufpos + len("" % x.charData) of xmlPI: echo("" % [x.PIName, x.PIRest]) of xmlElementStart: echo("<$1>" % x.elementName) of xmlElementEnd: echo("" % x.elementName) of xmlElementOpen: echo("<$1" % x.elementName) of xmlAttribute: echo("Key: " & x.attrKey) echo("Value: " & x.attrValue) of xmlElementClose: echo(">") of xmlCData: echo("" % x.charData) of xmlEntity: echo("&$1;" % x.entityName) of xmlSpecial: echo("SPECIAL: " & x.charData) close(x)