" while x.kind == xmlCharData: title.add(x.charData) x.next() if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0: echo("Title: " & title) quit(0) # Success! else: echo(x.errorMsgExpected("/title")) of xmlEof: break # end of file reached else: discard # ignore other events x.close() quit("Could not determine title!") ]## ##[ Example 2: Retrieve all HTML links ================================== The file ``examples/htmlrefs.nim`` demonstrates how to use the XML parser to accomplish another simple task: To determine all the links an HTML document contains. .. code-block:: nim # Example program to show the new parsexml module # This program reads an HTML file and writes all its used links to stdout. # Errors and whitespace are ignored. import os, streams, parsexml, strutils proc `=?=` (a, b: string): bool = # little trick: define our own comparator that ignores case return cmpIgnoreCase(a, b) == 0 if paramCount() < 1: quit("Usage: htmlrefs filename[.html]") var links = 0 # count the number of links var filename = addFileExt(paramStr(1), "html") var s = newFileStream(filename, fmRead) if s == nil: quit("cannot open the file " & filename) var x: XmlParser open(x, s, filename) next(x) # get first event block mainLoop: while true: case x.kind of xmlElementOpen: # the <a href = "xyz"> tag we are interested in always has an attribute, # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart`` if x.elementName =?= "a": x.next() if x.kind == xmlAttribute: if x.attrKey =?= "href": var link = x.attrValue inc(links) # skip until we have an ``xmlElementClose`` event while true: x.next() case x.kind of xmlEof: break mainLoop of xmlElementClose: break else: discard x.next() # skip ``xmlElementClose`` # now we have the description for the ``a`` element var desc = "" while x.kind == xmlCharData: desc.add(x.charData) x.next() echo(desc & ": " & link) else: x.next() of xmlEof: break # end of file reached of xmlError: echo(errorMsg(x)) x.next() else: 
x.next() # skip other events

    echo($links & " link(s) found!")
    x.close()

]##

import hashes, strutils, lexbase, streams, unicode

# the parser treats ``<br />`` as ``<br></br>``

# xmlElementCloseEnd, ## ``/>``

type
  XmlEventKind* = enum ## enumeration of all events that may occur when parsing
    xmlError,          ## an error occurred during parsing
    xmlEof,            ## end of file reached
    xmlCharData,       ## character data
    xmlWhitespace,     ## whitespace has been parsed
    xmlComment,        ## a comment has been parsed
    xmlPI,             ## processing instruction (``<?name something ?>``)
    xmlElementStart,   ## ``<elem>``
    xmlElementEnd,     ## ``</elem>``
    xmlElementOpen,    ## ``<elem``
    xmlAttribute,      ## ``key = "value"`` pair
    xmlElementClose,   ## ``>``
    xmlCData,          ## ``<![CDATA[`` ... data ... ``]]>``
    xmlEntity,         ## &entity;
    xmlSpecial         ## ``<! ... data ... >``

  XmlErrorKind* = enum      ## enumeration that lists all errors that can occur
    errNone,                ## no error
    errEndOfCDataExpected,  ## ``]]>`` expected
    errNameExpected,        ## name expected
    errSemicolonExpected,   ## ``;`` expected
    errQmGtExpected,        ## ``?>`` expected
    errGtExpected,          ## ``>`` expected
    errEqExpected,          ## ``=`` expected
    errQuoteExpected,       ## ``"`` or ``'`` expected
    errEndOfCommentExpected ## ``-->`` expected

  # Internal state kept in `XmlParser.state`; not exported.
  ParserState = enum
    stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError

  XmlParseOption* = enum ## options for the XML parser
    reportWhitespace,    ## report whitespace
    reportComments       ## report comments

  XmlParser* = object of BaseLexer ## the parser object.
    # `a` backs charData/elementName/entityName/attrKey/piName,
    # `b` backs attrValue/piRest. `c` and `cIsEmpty` are only initialised
    # in the code visible here (see `open`).
    a, b, c: string
    kind: XmlEventKind        # current event, exposed through `kind()`
    err: XmlErrorKind         # last error, index into `errorMessages`
    state: ParserState
    cIsEmpty: bool
    filename: string          # only used when formatting error messages
    options: set[XmlParseOption]

const
  # Human readable text for every `XmlErrorKind`, in declaration order.
  errorMessages: array[XmlErrorKind, string] = [
    "no error",
    "']]>' expected",
    "name expected",
    "';' expected",
    "'?>' expected",
    "'>' expected",
    "'=' expected",
    "'\"' or \"'\" expected",
    "'-->' expected"
  ]

proc open*(my: var XmlParser, input: Stream, filename: string,
           options: set[XmlParseOption] = {}) =
  ## initializes the parser with an input stream. `Filename` is only used
  ## for nice error messages. The parser's behaviour can be controlled by
  ## the `options` parameter: If `options` contains ``reportWhitespace``
  ## a whitespace token is reported as an ``xmlWhitespace`` event.
  ## If `options` contains ``reportComments`` a comment token is reported as an
  ## ``xmlComment`` event.
  # 8192 is the lexer buffer length; CR, LF and '/' are passed as the
  # lexer's refill characters.
  lexbase.open(my, input, 8192, {'\c', '\L', '/'})
  my.filename = filename
  my.state = stateStart
  my.kind = xmlError
  my.a = ""
  my.b = ""
  my.c = ""
  my.cIsEmpty = true
  my.options = options

proc close*(my: var XmlParser) {.inline.} =
  ## closes the parser `my` and its associated input stream.
  lexbase.close(my)

proc kind*(my: XmlParser): XmlEventKind {.inline.} =
  ## returns the current event type for the XML parser
  return my.kind

template charData*(my: XmlParser): string =
  ## returns the character data for the events: ``xmlCharData``,
  ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
  ## Raises an assertion in debug mode if ``my.kind`` is not one
  ## of those events. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
                     xmlSpecial})
  my.a

template elementName*(my: XmlParser): string =
  ## returns the element name for the events: ``xmlElementStart``,
  ## ``xmlElementEnd``, ``xmlElementOpen``
  ## Raises an assertion in debug mode if ``my.kind`` is not one
  ## of those events. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
  my.a

template entityName*(my: XmlParser): string =
  ## returns the entity name for the event: ``xmlEntity``
  ## Raises an assertion in debug mode if ``my.kind`` is not
  ## ``xmlEntity``. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind == xmlEntity)
  my.a

template attrKey*(my: XmlParser): string =
  ## returns the attribute key for the event ``xmlAttribute``
  ## Raises an assertion in debug mode if ``my.kind`` is not
  ## ``xmlAttribute``. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind == xmlAttribute)
  my.a

template attrValue*(my: XmlParser): string =
  ## returns the attribute value for the event ``xmlAttribute``
  ## Raises an assertion in debug mode if ``my.kind`` is not
  ## ``xmlAttribute``. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind == xmlAttribute)
  my.b

template piName*(my: XmlParser): string =
  ## returns the processing instruction name for the event ``xmlPI``
  ## Raises an assertion in debug mode if ``my.kind`` is not
  ## ``xmlPI``. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind == xmlPI)
  my.a

template piRest*(my: XmlParser): string =
  ## returns the rest of the processing instruction for the event ``xmlPI``
  ## Raises an assertion in debug mode if ``my.kind`` is not
  ## ``xmlPI``. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind == xmlPI)
  my.b

proc rawData*(my: XmlParser): string {.inline.} =
  ## returns the underlying 'data' string by reference.
  ## This is only used for speed hacks.
  shallowCopy(result, my.a)

proc rawData2*(my: XmlParser): string {.inline.} =
  ## returns the underlying second 'data' string by reference.
  ## This is only used for speed hacks.
  shallowCopy(result, my.b)

proc getColumn*(my: XmlParser): int {.inline.} =
  ## get the current column the parser has arrived at.
  result = getColNumber(my, my.bufpos)

proc getLine*(my: XmlParser): int {.inline.} =
  ## get the current line the parser has arrived at.
  result = my.lineNumber

proc getFilename*(my: XmlParser): string {.inline.} =
  ## get the filename of the file that the parser processes.
  result = my.filename

proc errorMsg*(my: XmlParser): string =
  ## returns a helpful error message for the event ``xmlError``
  assert(my.kind == xmlError)
  result = "$1($2, $3) Error: $4" % [
    my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]

proc errorMsgExpected*(my: XmlParser, tag: string): string =
  ## returns an error message "<tag> expected" in the same format as the
  ## other error messages
  result = "$1($2, $3) Error: $4" % [
    my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]

proc errorMsg*(my: XmlParser, msg: string): string =
  ## returns an error message with text `msg` in the same format as the
  ## other error messages
  result = "$1($2, $3) Error: $4" % [
    my.filename, $getLine(my), $getColumn(my), msg]

proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
  ## records `kind` as the current error and switches the parser state to
  ## ``stateError``.
  my.err = kind
  my.state = stateError

proc parseCDATA(my: var XmlParser) =
  ## Scans a CDATA section. `my.bufpos` is expected to point at the leading
  ## ``<![CDATA[``; the section's content is appended to `my.a` until the
  ## closing ``]]>`` is consumed. CR and LF are both stored as ``'\L'``.
  ## If ``'\0'`` is met first, ``errEndOfCDataExpected`` is marked.
  ## In either case the event kind is set to ``xmlCData``.
  var pos = my.bufpos + len("<![CDATA[")
  var buf = my.buf
  while true:
    case buf[pos]
    of ']':
      if buf[pos+1] == ']' and buf[pos+2] == '>':
        inc(pos, 3)
        break
      add(my.a, ']')
      inc(pos)
    of '\0':
      markError(my, errEndOfCDataExpected)
      break
    of '\c':
      # the lexbase handlers may replace the buffer, so `buf` is re-read
      pos = lexbase.handleCR(my, pos)
      buf = my.buf
      add(my.a, '\L')
    of '\L':
      pos = lexbase.handleLF(my, pos)
      buf = my.buf
      add(my.a, '\L')
    of '/':
      # '/' is one of the refill characters registered in `open`
      pos = lexbase.handleRefillChar(my, pos)
      buf = my.buf
      add(my.a, '/')
    else:
      add(my.a, buf[pos])
      inc(pos)
  my.bufpos = pos # store back
  my.kind = xmlCData

# NOTE(review): this copy of the file is damaged from here on. The text
# between ``len("`` in `parseComment` and the tail of a demo `case` statement
# is missing (it looks as if markup-like text was stripped). The remaining
# fragment is kept token for token; restore the lost part from the original
# source before trying to compile this module.
proc parseComment(my: var XmlParser) =
  var pos = my.bufpos + len("" % x.charData)
    of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
    of xmlElementStart: echo("<$1>" % x.elementName)
    of xmlElementEnd: echo("</$1>" % x.elementName)
    of xmlElementOpen: echo("<$1" % x.elementName)
    of xmlAttribute:
      echo("Key: " & x.attrKey)
      echo("Value: " & x.attrValue)
    of xmlElementClose: echo(">")
    of xmlCData: echo("<![CDATA[$1]]>" % x.charData)
    of xmlEntity: echo("&$1;" % x.entityName)
    of xmlSpecial: echo("SPECIAL: " & x.charData)
  close(x)

#
#
#            Nim's Runtime Library
#        (c) Copyright 2010 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
## parser.
## The only encoding that is supported is UTF-8. The parser has been designed
## to be somewhat error correcting, so that even most "wild HTML" found on the
## web can be parsed with it. **Note:** This parser does not check that each
## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
## implemented by the client code for various reasons:
##
## * Old HTML contains tags that have no end tag: ``<br>`` for example.
## * HTML tags are case insensitive, XML tags are case sensitive. Since this
##   library can parse both, only the client knows which comparison is to be
##   used.
## * Thus the checks would have been very difficult to implement properly with
##   little benefit, especially since they are simple to implement in the
##   client. The client should use the `errorMsgExpected` proc to generate
##   a nice error message that fits the other error messages this library
##   creates.
##
##

##[

Example 1: Retrieve HTML title
==============================

The file ``examples/htmltitle.nim`` demonstrates how to use the
XML parser to accomplish a simple task: To determine the title of an HTML
document.

.. code-block:: nim

    # Example program to show the parsexml module
    # This program reads an HTML file and writes its title to stdout.
    # Errors and whitespace are ignored.

    import os, streams, parsexml, strutils

    if paramCount() < 1:
      quit("Usage: htmltitle filename[.html]")

    var filename = addFileExt(paramStr(1), "html")
    var s = newFileStream(filename, fmRead)
    if s == nil: quit("cannot open the file " & filename)
    var x: XmlParser
    open(x, s, filename)
    while true:
      x.next()
      case x.kind
      of xmlElementStart:
        if cmpIgnoreCase(x.elementName, "title") == 0:
          var title = ""
          x.next() # skip "<title>"
          while x.kind == xmlCharData:
            title.add(x.charData)
            x.next()
          if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
            echo("Title: " & title)
            quit(0) # Success!
          else:
            echo(x.errorMsgExpected("/title"))

      of xmlEof: break # end of file reached
      else: discard # ignore other events

    x.close()
    quit("Could not determine title!")

]##

##[

Example 2: Retrieve all HTML links
==================================

The file ``examples/htmlrefs.nim`` demonstrates how to use the
XML parser to accomplish another simple task: To determine all the links
an HTML document contains.

.. code-block:: nim

    # Example program to show the new parsexml module
    # This program reads an HTML file and writes all its used links to stdout.
    # Errors and whitespace are ignored.

    import os, streams, parsexml, strutils

    proc `=?=` (a, b: string): bool =
      # little trick: define our own comparator that ignores case
      return cmpIgnoreCase(a, b) == 0

    if paramCount() < 1:
      quit("Usage: htmlrefs filename[.html]")

    var links = 0 # count the number of links
    var filename = addFileExt(paramStr(1), "html")
    var s = newFileStream(filename, fmRead)
    if s == nil: quit("cannot open the file " & filename)
    var x: XmlParser
    open(x, s, filename)
    next(x) # get first event
    block mainLoop:
      while true:
        case x.kind
        of xmlElementOpen:
          # the <a href = "xyz"> tag we are interested in always has an attribute,
          # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
          if x.elementName =?= "a":
            x.next()
            if x.kind == xmlAttribute:
              if x.attrKey =?= "href":
                var link = x.attrValue
                inc(links)
                # skip until we have an ``xmlElementClose`` event
                while true:
                  x.next()
                  case x.kind
                  of xmlEof: break mainLoop
                  of xmlElementClose: break
                  else: discard
                x.next() # skip ``xmlElementClose``
                # now we have the description for the ``a`` element
                var desc = ""
                while x.kind == xmlCharData:
                  desc.add(x.charData)
                  x.next()
                echo(desc & ": " & link)
          else:
            x.next()
        of xmlEof: break # end of file reached
        of xmlError:
          echo(errorMsg(x))
          x.next()
        else: x.next() # skip other events

    echo($links & " link(s) found!")
    x.close()

]##

import hashes, strutils, lexbase, streams, unicode

# the parser treats ``<br />`` as ``<br></br>``

# xmlElementCloseEnd, ## ``/>``

type
  XmlEventKind* = enum ## enumeration of all events that may occur when parsing
    xmlError,          ## an error occurred during parsing
    xmlEof,            ## end of file reached
    xmlCharData,       ## character data
    xmlWhitespace,     ## whitespace has been parsed
    xmlComment,        ## a comment has been parsed
    xmlPI,             ## processing instruction (``<?name something ?>``)
    xmlElementStart,   ## ``<elem>``
    xmlElementEnd,     ## ``</elem>``
    xmlElementOpen,    ## ``<elem``
    xmlAttribute,      ## ``key = "value"`` pair
    xmlElementClose,   ## ``>``
    xmlCData,          ## ``<![CDATA[`` ... data ... ``]]>``
    xmlEntity,         ## &entity;
    xmlSpecial         ## ``<! ... data ... >``

  XmlErrorKind* = enum      ## enumeration that lists all errors that can occur
    errNone,                ## no error
    errEndOfCDataExpected,  ## ``]]>`` expected
    errNameExpected,        ## name expected
    errSemicolonExpected,   ## ``;`` expected
    errQmGtExpected,        ## ``?>`` expected
    errGtExpected,          ## ``>`` expected
    errEqExpected,          ## ``=`` expected
    errQuoteExpected,       ## ``"`` or ``'`` expected
    errEndOfCommentExpected ## ``-->`` expected

  # Internal state kept in `XmlParser.state`; not exported.
  ParserState = enum
    stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError

  XmlParseOption* = enum ## options for the XML parser
    reportWhitespace,    ## report whitespace
    reportComments       ## report comments

  XmlParser* = object of BaseLexer ## the parser object.
    # `a` backs charData/elementName/entityName/attrKey/piName,
    # `b` backs attrValue/piRest. `c` and `cIsEmpty` are only initialised
    # in the code visible here (see `open`).
    a, b, c: string
    kind: XmlEventKind        # current event, exposed through `kind()`
    err: XmlErrorKind         # last error, index into `errorMessages`
    state: ParserState
    cIsEmpty: bool
    filename: string          # only used when formatting error messages
    options: set[XmlParseOption]

const
  # Human readable text for every `XmlErrorKind`, in declaration order.
  errorMessages: array[XmlErrorKind, string] = [
    "no error",
    "']]>' expected",
    "name expected",
    "';' expected",
    "'?>' expected",
    "'>' expected",
    "'=' expected",
    "'\"' or \"'\" expected",
    "'-->' expected"
  ]

proc open*(my: var XmlParser, input: Stream, filename: string,
           options: set[XmlParseOption] = {}) =
  ## initializes the parser with an input stream. `Filename` is only used
  ## for nice error messages. The parser's behaviour can be controlled by
  ## the `options` parameter: If `options` contains ``reportWhitespace``
  ## a whitespace token is reported as an ``xmlWhitespace`` event.
  ## If `options` contains ``reportComments`` a comment token is reported as an
  ## ``xmlComment`` event.
  # 8192 is the lexer buffer length; CR, LF and '/' are passed as the
  # lexer's refill characters.
  lexbase.open(my, input, 8192, {'\c', '\L', '/'})
  my.filename = filename
  my.state = stateStart
  my.kind = xmlError
  my.a = ""
  my.b = ""
  my.c = ""
  my.cIsEmpty = true
  my.options = options

proc close*(my: var XmlParser) {.inline.} =
  ## closes the parser `my` and its associated input stream.
  lexbase.close(my)

proc kind*(my: XmlParser): XmlEventKind {.inline.} =
  ## returns the current event type for the XML parser
  return my.kind

template charData*(my: XmlParser): string =
  ## returns the character data for the events: ``xmlCharData``,
  ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
  ## Raises an assertion in debug mode if ``my.kind`` is not one
  ## of those events. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
                     xmlSpecial})
  my.a

template elementName*(my: XmlParser): string =
  ## returns the element name for the events: ``xmlElementStart``,
  ## ``xmlElementEnd``, ``xmlElementOpen``
  ## Raises an assertion in debug mode if ``my.kind`` is not one
  ## of those events. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
  my.a

template entityName*(my: XmlParser): string =
  ## returns the entity name for the event: ``xmlEntity``
  ## Raises an assertion in debug mode if ``my.kind`` is not
  ## ``xmlEntity``. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind == xmlEntity)
  my.a

template attrKey*(my: XmlParser): string =
  ## returns the attribute key for the event ``xmlAttribute``
  ## Raises an assertion in debug mode if ``my.kind`` is not
  ## ``xmlAttribute``. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind == xmlAttribute)
  my.a

template attrValue*(my: XmlParser): string =
  ## returns the attribute value for the event ``xmlAttribute``
  ## Raises an assertion in debug mode if ``my.kind`` is not
  ## ``xmlAttribute``. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind == xmlAttribute)
  my.b

template piName*(my: XmlParser): string =
  ## returns the processing instruction name for the event ``xmlPI``
  ## Raises an assertion in debug mode if ``my.kind`` is not
  ## ``xmlPI``. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind == xmlPI)
  my.a

template piRest*(my: XmlParser): string =
  ## returns the rest of the processing instruction for the event ``xmlPI``
  ## Raises an assertion in debug mode if ``my.kind`` is not
  ## ``xmlPI``. In release mode, this will not trigger an error
  ## but the value returned will not be valid.
  assert(my.kind == xmlPI)
  my.b

proc rawData*(my: XmlParser): string {.inline.} =
  ## returns the underlying 'data' string by reference.
  ## This is only used for speed hacks.
  shallowCopy(result, my.a)

proc rawData2*(my: XmlParser): string {.inline.} =
  ## returns the underlying second 'data' string by reference.
  ## This is only used for speed hacks.
  shallowCopy(result, my.b)

proc getColumn*(my: XmlParser): int {.inline.} =
  ## get the current column the parser has arrived at.
  result = getColNumber(my, my.bufpos)

proc getLine*(my: XmlParser): int {.inline.} =
  ## get the current line the parser has arrived at.
  result = my.lineNumber

proc getFilename*(my: XmlParser): string {.inline.} =
  ## get the filename of the file that the parser processes.
  result = my.filename

proc errorMsg*(my: XmlParser): string =
  ## returns a helpful error message for the event ``xmlError``
  assert(my.kind == xmlError)
  result = "$1($2, $3) Error: $4" % [
    my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]

proc errorMsgExpected*(my: XmlParser, tag: string): string =
  ## returns an error message "<tag> expected" in the same format as the
  ## other error messages
  result = "$1($2, $3) Error: $4" % [
    my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]

proc errorMsg*(my: XmlParser, msg: string): string =
  ## returns an error message with text `msg` in the same format as the
  ## other error messages
  result = "$1($2, $3) Error: $4" % [
    my.filename, $getLine(my), $getColumn(my), msg]

proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
  ## records `kind` as the current error and switches the parser state to
  ## ``stateError``.
  my.err = kind
  my.state = stateError

proc parseCDATA(my: var XmlParser) =
  ## Scans a CDATA section. `my.bufpos` is expected to point at the leading
  ## ``<![CDATA[``; the section's content is appended to `my.a` until the
  ## closing ``]]>`` is consumed. CR and LF are both stored as ``'\L'``.
  ## If ``'\0'`` is met first, ``errEndOfCDataExpected`` is marked.
  ## In either case the event kind is set to ``xmlCData``.
  var pos = my.bufpos + len("<![CDATA[")
  var buf = my.buf
  while true:
    case buf[pos]
    of ']':
      if buf[pos+1] == ']' and buf[pos+2] == '>':
        inc(pos, 3)
        break
      add(my.a, ']')
      inc(pos)
    of '\0':
      markError(my, errEndOfCDataExpected)
      break
    of '\c':
      # the lexbase handlers may replace the buffer, so `buf` is re-read
      pos = lexbase.handleCR(my, pos)
      buf = my.buf
      add(my.a, '\L')
    of '\L':
      pos = lexbase.handleLF(my, pos)
      buf = my.buf
      add(my.a, '\L')
    of '/':
      # '/' is one of the refill characters registered in `open`
      pos = lexbase.handleRefillChar(my, pos)
      buf = my.buf
      add(my.a, '/')
    else:
      add(my.a, buf[pos])
      inc(pos)
  my.bufpos = pos # store back
  my.kind = xmlCData

# NOTE(review): this copy of the file is damaged from here on. The text
# between ``len("`` in `parseComment` and the tail of a demo `case` statement
# is missing (it looks as if markup-like text was stripped). The remaining
# fragment is kept token for token; restore the lost part from the original
# source before trying to compile this module.
proc parseComment(my: var XmlParser) =
  var pos = my.bufpos + len("" % x.charData)
    of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
    of xmlElementStart: echo("<$1>" % x.elementName)
    of xmlElementEnd: echo("</$1>" % x.elementName)
    of xmlElementOpen: echo("<$1" % x.elementName)
    of xmlAttribute:
      echo("Key: " & x.attrKey)
      echo("Value: " & x.attrValue)
    of xmlElementClose: echo(">")
    of xmlCData: echo("<![CDATA[$1]]>" % x.charData)
    of xmlEntity: echo("&$1;" % x.entityName)
    of xmlSpecial: echo("SPECIAL: " & x.charData)
  close(x)