diff options
Diffstat (limited to 'lib/pure/parsexml.nim')
-rw-r--r-- | lib/pure/parsexml.nim | 432 |
1 files changed, 281 insertions, 151 deletions
diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim index f8b2c3d8d..c760799a2 100644 --- a/lib/pure/parsexml.nim +++ b/lib/pure/parsexml.nim @@ -26,81 +26,183 @@ ## creates. ## ## -## Example 1: Retrieve HTML title -## ============================== -## -## The file ``examples/htmltitle.nim`` demonstrates how to use the -## XML parser to accomplish a simple task: To determine the title of an HTML -## document. -## -## .. code-block:: nim -## :file: examples/htmltitle.nim -## -## -## Example 2: Retrieve all HTML links -## ================================== -## -## The file ``examples/htmlrefs.nim`` demonstrates how to use the -## XML parser to accomplish another simple task: To determine all the links -## an HTML document contains. -## -## .. code-block:: nim -## :file: examples/htmlrefs.nim -## + +##[ + +Example 1: Retrieve HTML title +============================== + +The file ``examples/htmltitle.nim`` demonstrates how to use the +XML parser to accomplish a simple task: To determine the title of an HTML +document. + + ```nim + # Example program to show the parsexml module + # This program reads an HTML file and writes its title to stdout. + # Errors and whitespace are ignored. + + import std/[os, streams, parsexml, strutils] + + if paramCount() < 1: + quit("Usage: htmltitle filename[.html]") + + var filename = addFileExt(paramStr(1), "html") + var s = newFileStream(filename, fmRead) + if s == nil: quit("cannot open the file " & filename) + var x: XmlParser + open(x, s, filename) + while true: + x.next() + case x.kind + of xmlElementStart: + if cmpIgnoreCase(x.elementName, "title") == 0: + var title = "" + x.next() # skip "<title>" + while x.kind == xmlCharData: + title.add(x.charData) + x.next() + if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0: + echo("Title: " & title) + quit(0) # Success! + else: + echo(x.errorMsgExpected("/title")) + + of xmlEof: break # end of file reached + else: discard # ignore other events + + x.close() + quit("Could not determine title!") + ``` + +]## + +##[ + +Example 2: Retrieve all HTML links +================================== + +The file ``examples/htmlrefs.nim`` demonstrates how to use the +XML parser to accomplish another simple task: To determine all the links +an HTML document contains. + + ```nim + # Example program to show the new parsexml module + # This program reads an HTML file and writes all its used links to stdout. + # Errors and whitespace are ignored. + + import std/[os, streams, parsexml, strutils] + + proc `=?=` (a, b: string): bool = + # little trick: define our own comparator that ignores case + return cmpIgnoreCase(a, b) == 0 + + if paramCount() < 1: + quit("Usage: htmlrefs filename[.html]") + + var links = 0 # count the number of links + var filename = addFileExt(paramStr(1), "html") + var s = newFileStream(filename, fmRead) + if s == nil: quit("cannot open the file " & filename) + var x: XmlParser + open(x, s, filename) + next(x) # get first event + block mainLoop: + while true: + case x.kind + of xmlElementOpen: + # the <a href = "xyz"> tag we are interested in always has an attribute, + # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart`` + if x.elementName =?= "a": + x.next() + if x.kind == xmlAttribute: + if x.attrKey =?= "href": + var link = x.attrValue + inc(links) + # skip until we have an ``xmlElementClose`` event + while true: + x.next() + case x.kind + of xmlEof: break mainLoop + of xmlElementClose: break + else: discard + x.next() # skip ``xmlElementClose`` + # now we have the description for the ``a`` element + var desc = "" + while x.kind == xmlCharData: + desc.add(x.charData) + x.next() + echo(desc & ": " & link) + else: + x.next() + of xmlEof: break # end of file reached + of xmlError: + echo(errorMsg(x)) + x.next() + else: x.next() # skip other events + + echo($links & " link(s) found!") + x.close() + ``` + +]## import - hashes, strutils, lexbase, streams, unicode + std/[strutils, lexbase, streams, unicode] + +when defined(nimPreviewSlimSystem): + import std/[assertions, syncio] # the parser treats ``<br />`` as ``<br></br>`` # xmlElementCloseEnd, ## ``/>`` type - XmlEventKind* = enum ## enumation of all events that may occur when parsing - xmlError, ## an error occurred during parsing - xmlEof, ## end of file reached - xmlCharData, ## character data - xmlWhitespace, ## whitespace has been parsed - xmlComment, ## a comment has been parsed - xmlPI, ## processing instruction (``<?name something ?>``) - xmlElementStart, ## ``<elem>`` - xmlElementEnd, ## ``</elem>`` - xmlElementOpen, ## ``<elem - xmlAttribute, ## ``key = "value"`` pair - xmlElementClose, ## ``>`` - xmlCData, ## ``<![CDATA[`` ... data ... ``]]>`` - xmlEntity, ## &entity; - xmlSpecial ## ``<! ... data ... >`` - - XmlErrorKind* = enum ## enumeration that lists all errors that can occur - errNone, ## no error - errEndOfCDataExpected, ## ``]]>`` expected - errNameExpected, ## name expected - errSemicolonExpected, ## ``;`` expected - errQmGtExpected, ## ``?>`` expected - errGtExpected, ## ``>`` expected - errEqExpected, ## ``=`` expected - errQuoteExpected, ## ``"`` or ``'`` expected - errEndOfCommentExpected ## ``-->`` expected + XmlEventKind* = enum ## enumeration of all events that may occur when parsing + xmlError, ## an error occurred during parsing + xmlEof, ## end of file reached + xmlCharData, ## character data + xmlWhitespace, ## whitespace has been parsed + xmlComment, ## a comment has been parsed + xmlPI, ## processing instruction (``<?name something ?>``) + xmlElementStart, ## ``<elem>`` + xmlElementEnd, ## ``</elem>`` + xmlElementOpen, ## ``<elem + xmlAttribute, ## ``key = "value"`` pair + xmlElementClose, ## ``>`` + xmlCData, ## ``<![CDATA[`` ... data ... ``]]>`` + xmlEntity, ## &entity; + xmlSpecial ## ``<! ... data ... >`` + + XmlErrorKind* = enum ## enumeration that lists all errors that can occur + errNone, ## no error + errEndOfCDataExpected, ## ``]]>`` expected + errNameExpected, ## name expected + errSemicolonExpected, ## ``;`` expected + errQmGtExpected, ## ``?>`` expected + errGtExpected, ## ``>`` expected + errEqExpected, ## ``=`` expected + errQuoteExpected, ## ``"`` or ``'`` expected + errEndOfCommentExpected ## ``-->`` expected + errAttributeValueExpected ## non-empty attribute value expected ParserState = enum stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError - XmlParseOption* = enum ## options for the XML parser - reportWhitespace, ## report whitespace - reportComments ## report comments + XmlParseOption* = enum ## options for the XML parser + reportWhitespace, ## report whitespace + reportComments ## report comments + allowUnquotedAttribs ## allow unquoted attribute values (for HTML) + allowEmptyAttribs ## allow empty attributes (without explicit value) XmlParser* = object of BaseLexer ## the parser object. a, b, c: string kind: XmlEventKind err: XmlErrorKind state: ParserState + cIsEmpty: bool filename: string options: set[XmlParseOption] -{.deprecated: [TXmlParser: XmlParser, TXmlParseOptions: XmlParseOption, - TXmlError: XmlErrorKind, TXmlEventKind: XmlEventKind].} - const errorMessages: array[XmlErrorKind, string] = [ "no error", @@ -111,7 +213,8 @@ const "'>' expected", "'=' expected", "'\"' or \"'\" expected", - "'-->' expected" + "'-->' expected", + "attribute value expected" ] proc open*(my: var XmlParser, input: Stream, filename: string, @@ -128,7 +231,8 @@ proc open*(my: var XmlParser, input: Stream, filename: string, my.kind = xmlError my.a = "" my.b = "" - my.c = nil + my.c = "" + my.cIsEmpty = true my.options = options proc close*(my: var XmlParser) {.inline.} = @@ -142,6 +246,9 @@ proc kind*(my: XmlParser): XmlEventKind {.inline.} = template charData*(my: XmlParser): string = ## returns the character data for the events: ``xmlCharData``, ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial`` + ## Raises an assertion in debug mode if ``my.kind`` is not one + ## of those events. In release mode, this will not trigger an error + ## but the value returned will not be valid. assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData, xmlSpecial}) my.a @@ -149,43 +256,61 @@ template charData*(my: XmlParser): string = template elementName*(my: XmlParser): string = ## returns the element name for the events: ``xmlElementStart``, ## ``xmlElementEnd``, ``xmlElementOpen`` + ## Raises an assertion in debug mode if ``my.kind`` is not one + ## of those events. In release mode, this will not trigger an error + ## but the value returned will not be valid. assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen}) my.a template entityName*(my: XmlParser): string = ## returns the entity name for the event: ``xmlEntity`` + ## Raises an assertion in debug mode if ``my.kind`` is not + ## ``xmlEntity``. In release mode, this will not trigger an error + ## but the value returned will not be valid. assert(my.kind == xmlEntity) my.a template attrKey*(my: XmlParser): string = ## returns the attribute key for the event ``xmlAttribute`` + ## Raises an assertion in debug mode if ``my.kind`` is not + ## ``xmlAttribute``. In release mode, this will not trigger an error + ## but the value returned will not be valid. assert(my.kind == xmlAttribute) my.a template attrValue*(my: XmlParser): string = ## returns the attribute value for the event ``xmlAttribute`` + ## Raises an assertion in debug mode if ``my.kind`` is not + ## ``xmlAttribute``. In release mode, this will not trigger an error + ## but the value returned will not be valid. assert(my.kind == xmlAttribute) my.b template piName*(my: XmlParser): string = ## returns the processing instruction name for the event ``xmlPI`` + ## Raises an assertion in debug mode if ``my.kind`` is not + ## ``xmlPI``. In release mode, this will not trigger an error + ## but the value returned will not be valid. assert(my.kind == xmlPI) my.a template piRest*(my: XmlParser): string = ## returns the rest of the processing instruction for the event ``xmlPI`` + ## Raises an assertion in debug mode if ``my.kind`` is not + ## ``xmlPI``. In release mode, this will not trigger an error + ## but the value returned will not be valid. assert(my.kind == xmlPI) my.b -proc rawData*(my: XmlParser): string {.inline.} = +proc rawData*(my: var XmlParser): lent string {.inline.} = ## returns the underlying 'data' string by reference. ## This is only used for speed hacks. - shallowCopy(result, my.a) + result = my.a -proc rawData2*(my: XmlParser): string {.inline.} = +proc rawData2*(my: var XmlParser): lent string {.inline.} = ## returns the underlying second 'data' string by reference. ## This is only used for speed hacks. - shallowCopy(result, my.b) + result = my.b proc getColumn*(my: XmlParser): int {.inline.} = ## get the current column the parser has arrived at. @@ -223,11 +348,10 @@ proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} = proc parseCDATA(my: var XmlParser) = var pos = my.bufpos + len("<![CDATA[") - var buf = my.buf while true: - case buf[pos] + case my.buf[pos] of ']': - if buf[pos+1] == ']' and buf[pos+2] == '>': + if my.buf[pos+1] == ']' and my.buf[pos+2] == '>': inc(pos, 3) break add(my.a, ']') @@ -237,29 +361,25 @@ proc parseCDATA(my: var XmlParser) = break of '\c': pos = lexbase.handleCR(my, pos) - buf = my.buf add(my.a, '\L') of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf add(my.a, '\L') of '/': pos = lexbase.handleRefillChar(my, pos) - buf = my.buf add(my.a, '/') else: - add(my.a, buf[pos]) + add(my.a, my.buf[pos]) inc(pos) my.bufpos = pos # store back my.kind = xmlCData proc parseComment(my: var XmlParser) = var pos = my.bufpos + len("<!--") - var buf = my.buf while true: - case buf[pos] + case my.buf[pos] of '-': - if buf[pos+1] == '-' and buf[pos+2] == '>': + if my.buf[pos+1] == '-' and my.buf[pos+2] == '>': inc(pos, 3) break if my.options.contains(reportComments): add(my.a, '-') @@ -269,38 +389,32 @@ proc parseComment(my: var XmlParser) = break of '\c': pos = lexbase.handleCR(my, pos) - buf = my.buf if my.options.contains(reportComments): add(my.a, '\L') of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf if my.options.contains(reportComments): add(my.a, '\L') of '/': pos = lexbase.handleRefillChar(my, pos) - buf = my.buf if my.options.contains(reportComments): add(my.a, '/') else: - if my.options.contains(reportComments): add(my.a, buf[pos]) + if my.options.contains(reportComments): add(my.a, my.buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlComment -proc parseWhitespace(my: var XmlParser, skip=false) = +proc parseWhitespace(my: var XmlParser, skip = false) = var pos = my.bufpos - var buf = my.buf while true: - case buf[pos] + case my.buf[pos] of ' ', '\t': - if not skip: add(my.a, buf[pos]) + if not skip: add(my.a, my.buf[pos]) inc(pos) of '\c': # the specification says that CR-LF, CR are to be transformed to LF pos = lexbase.handleCR(my, pos) - buf = my.buf if not skip: add(my.a, '\L') of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf if not skip: add(my.a, '\L') else: break @@ -312,82 +426,82 @@ const proc parseName(my: var XmlParser, dest: var string) = var pos = my.bufpos - var buf = my.buf - if buf[pos] in NameStartChar: + if my.buf[pos] in NameStartChar: while true: - add(dest, buf[pos]) + add(dest, my.buf[pos]) inc(pos) - if buf[pos] notin NameChar: break + if my.buf[pos] notin NameChar: break my.bufpos = pos else: markError(my, errNameExpected) proc parseEntity(my: var XmlParser, dest: var string) = var pos = my.bufpos+1 - var buf = my.buf my.kind = xmlCharData - if buf[pos] == '#': + if my.buf[pos] == '#': var r: int inc(pos) - if buf[pos] == 'x': + if my.buf[pos] == 'x': inc(pos) while true: - case buf[pos] - of '0'..'9': r = (r shl 4) or (ord(buf[pos]) - ord('0')) - of 'a'..'f': r = (r shl 4) or (ord(buf[pos]) - ord('a') + 10) - of 'A'..'F': r = (r shl 4) or (ord(buf[pos]) - ord('A') + 10) + case my.buf[pos] + of '0'..'9': r = (r shl 4) or (ord(my.buf[pos]) - ord('0')) + of 'a'..'f': r = (r shl 4) or (ord(my.buf[pos]) - ord('a') + 10) + of 'A'..'F': r = (r shl 4) or (ord(my.buf[pos]) - ord('A') + 10) else: break inc(pos) else: - while buf[pos] in {'0'..'9'}: - r = r * 10 + (ord(buf[pos]) - ord('0')) + while my.buf[pos] in {'0'..'9'}: + r = r * 10 + (ord(my.buf[pos]) - ord('0')) inc(pos) add(dest, toUTF8(Rune(r))) - elif buf[pos] == 'l' and buf[pos+1] == 't' and buf[pos+2] == ';': + elif my.buf[pos] == 'l' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';': add(dest, '<') inc(pos, 2) - elif buf[pos] == 'g' and buf[pos+1] == 't' and buf[pos+2] == ';': + elif my.buf[pos] == 'g' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';': add(dest, '>') inc(pos, 2) - elif buf[pos] == 'a' and buf[pos+1] == 'm' and buf[pos+2] == 'p' and - buf[pos+3] == ';': + elif my.buf[pos] == 'a' and my.buf[pos+1] == 'm' and my.buf[pos+2] == 'p' and + my.buf[pos+3] == ';': add(dest, '&') inc(pos, 3) - elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and - buf[pos+3] == 's' and buf[pos+4] == ';': + elif my.buf[pos] == 'a' and my.buf[pos+1] == 'p' and my.buf[pos+2] == 'o' and + my.buf[pos+3] == 's' and my.buf[pos+4] == ';': add(dest, '\'') inc(pos, 4) - elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and - buf[pos+3] == 't' and buf[pos+4] == ';': + elif my.buf[pos] == 'q' and my.buf[pos+1] == 'u' and my.buf[pos+2] == 'o' and + my.buf[pos+3] == 't' and my.buf[pos+4] == ';': add(dest, '"') inc(pos, 4) else: my.bufpos = pos - parseName(my, dest) + var name = "" + parseName(my, name) pos = my.bufpos - if my.err != errNameExpected: + if my.err != errNameExpected and my.buf[pos] == ';': my.kind = xmlEntity else: add(dest, '&') - if buf[pos] == ';': + add(dest, name) + if my.buf[pos] == ';': inc(pos) else: - markError(my, errSemicolonExpected) + my.err = errSemicolonExpected + # do not overwrite 'my.state' here, it's a benign error my.bufpos = pos proc parsePI(my: var XmlParser) = inc(my.bufpos, "<?".len) parseName(my, my.a) var pos = my.bufpos - var buf = my.buf setLen(my.b, 0) while true: - case buf[pos] + case my.buf[pos] of '\0': markError(my, errQmGtExpected) break of '?': - if buf[pos+1] == '>': + if my.buf[pos+1] == '>': inc(pos, 2) break add(my.b, '?') @@ -395,18 +509,15 @@ proc parsePI(my: var XmlParser) = of '\c': # the specification says that CR-LF, CR are to be transformed to LF pos = lexbase.handleCR(my, pos) - buf = my.buf add(my.b, '\L') of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf add(my.b, '\L') of '/': pos = lexbase.handleRefillChar(my, pos) - buf = my.buf add(my.b, '/') else: - add(my.b, buf[pos]) + add(my.b, my.buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlPI @@ -414,10 +525,9 @@ proc parsePI(my: var XmlParser) = proc parseSpecial(my: var XmlParser) = # things that start with <! var pos = my.bufpos + 2 - var buf = my.buf var opentags = 0 while true: - case buf[pos] + case my.buf[pos] of '\0': markError(my, errGtExpected) break @@ -434,18 +544,15 @@ proc parseSpecial(my: var XmlParser) = add(my.a, '>') of '\c': pos = lexbase.handleCR(my, pos) - buf = my.buf add(my.a, '\L') of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf add(my.a, '\L') of '/': pos = lexbase.handleRefillChar(my, pos) - buf = my.buf add(my.b, '/') else: - add(my.a, buf[pos]) + add(my.a, my.buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlSpecial @@ -458,12 +565,13 @@ proc parseTag(my: var XmlParser) = my.kind = xmlCharData add(my.a, '<') return - parseWhitespace(my, skip=true) + parseWhitespace(my, skip = true) if my.buf[my.bufpos] in NameStartChar: # an attribute follows: my.kind = xmlElementOpen my.state = stateAttr my.c = my.a # save for later + my.cIsEmpty = false else: my.kind = xmlElementStart let slash = my.buf[my.bufpos] == '/' @@ -472,7 +580,8 @@ proc parseTag(my: var XmlParser) = if slash and my.buf[my.bufpos] == '>': inc(my.bufpos) my.state = stateEmptyElementTag - my.c = nil + my.c = "" + my.cIsEmpty = true elif my.buf[my.bufpos] == '>': inc(my.bufpos) else: @@ -482,7 +591,7 @@ proc parseEndTag(my: var XmlParser) = my.bufpos = lexbase.handleRefillChar(my, my.bufpos+1) #inc(my.bufpos, 2) parseName(my, my.a) - parseWhitespace(my, skip=true) + parseWhitespace(my, skip = true) if my.buf[my.bufpos] == '>': inc(my.bufpos) else: @@ -498,21 +607,25 @@ proc parseAttribute(my: var XmlParser) = if my.a.len == 0: markError(my, errGtExpected) return - parseWhitespace(my, skip=true) + + let startPos = my.bufpos + parseWhitespace(my, skip = true) if my.buf[my.bufpos] != '=': - markError(my, errEqExpected) + if allowEmptyAttribs notin my.options or + (my.buf[my.bufpos] != '>' and my.bufpos == startPos): + markError(my, errEqExpected) return + inc(my.bufpos) - parseWhitespace(my, skip=true) + parseWhitespace(my, skip = true) var pos = my.bufpos - var buf = my.buf - if buf[pos] in {'\'', '"'}: - var quote = buf[pos] + if my.buf[pos] in {'\'', '"'}: + var quote = my.buf[pos] var pendingSpace = false inc(pos) while true: - case buf[pos] + case my.buf[pos] of '\0': markError(my, errQuoteExpected) break @@ -529,52 +642,67 @@ proc parseAttribute(my: var XmlParser) = inc(pos) of '\c': pos = lexbase.handleCR(my, pos) - buf = my.buf pendingSpace = true of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf pendingSpace = true of '/': pos = lexbase.handleRefillChar(my, pos) - buf = my.buf add(my.b, '/') else: - if buf[pos] == quote: + if my.buf[pos] == quote: inc(pos) break else: if pendingSpace: add(my.b, ' ') pendingSpace = false - add(my.b, buf[pos]) + add(my.b, my.buf[pos]) inc(pos) + elif allowUnquotedAttribs in my.options: + const disallowedChars = {'"', '\'', '`', '=', '<', '>', ' ', + '\0', '\t', '\L', '\F', '\f'} + let startPos = pos + while (let c = my.buf[pos]; c notin disallowedChars): + if c == '&': + my.bufpos = pos + parseEntity(my, my.b) + my.kind = xmlAttribute # parseEntity overwrites my.kind! + pos = my.bufpos + elif c == '/': + pos = lexbase.handleRefillChar(my, pos) + add(my.b, '/') + else: + add(my.b, c) + inc(pos) + if pos == startPos: + markError(my, errAttributeValueExpected) else: markError(my, errQuoteExpected) + # error corrections: guess what was meant + while my.buf[pos] != '>' and my.buf[pos] > ' ': + add(my.b, my.buf[pos]) + inc pos my.bufpos = pos - parseWhitespace(my, skip=true) + parseWhitespace(my, skip = true) proc parseCharData(my: var XmlParser) = var pos = my.bufpos - var buf = my.buf while true: - case buf[pos] + case my.buf[pos] of '\0', '<', '&': break of '\c': # the specification says that CR-LF, CR are to be transformed to LF pos = lexbase.handleCR(my, pos) - buf = my.buf add(my.a, '\L') of '\L': pos = lexbase.handleLF(my, pos) - buf = my.buf add(my.a, '\L') of '/': pos = lexbase.handleRefillChar(my, pos) - buf = my.buf add(my.a, '/') else: - add(my.a, buf[pos]) + add(my.a, my.buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlCharData @@ -583,18 +711,18 @@ proc rawGetTok(my: var XmlParser) = my.kind = xmlError setLen(my.a, 0) var pos = my.bufpos - var buf = my.buf - case buf[pos] + case my.buf[pos] of '<': - case buf[pos+1] + case my.buf[pos+1] of '/': parseEndTag(my) of '!': - if buf[pos+2] == '[' and buf[pos+3] == 'C' and buf[pos+4] == 'D' and - buf[pos+5] == 'A' and buf[pos+6] == 'T' and buf[pos+7] == 'A' and - buf[pos+8] == '[': + if my.buf[pos+2] == '[' and my.buf[pos+3] == 'C' and + my.buf[pos+4] == 'D' and my.buf[pos+5] == 'A' and + my.buf[pos+6] == 'T' and my.buf[pos+7] == 'A' and + my.buf[pos+8] == '[': parseCDATA(my) - elif buf[pos+2] == '-' and buf[pos+3] == '-': + elif my.buf[pos+2] == '-' and my.buf[pos+3] == '-': parseComment(my) else: parseSpecial(my) @@ -615,12 +743,15 @@ proc rawGetTok(my: var XmlParser) = proc getTok(my: var XmlParser) = while true: + let lastKind = my.kind rawGetTok(my) case my.kind of xmlComment: if my.options.contains(reportComments): break of xmlWhitespace: - if my.options.contains(reportWhitespace): break + if my.options.contains(reportWhitespace) or lastKind in {xmlCharData, + xmlComment, xmlEntity}: + break else: break proc next*(my: var XmlParser) = @@ -654,14 +785,14 @@ proc next*(my: var XmlParser) = of stateEmptyElementTag: my.state = stateNormal my.kind = xmlElementEnd - if not my.c.isNil: + if not my.cIsEmpty: my.a = my.c of stateError: my.kind = xmlError my.state = stateNormal when not defined(testing) and isMainModule: - import os + import std/os var s = newFileStream(paramStr(1), fmRead) if s == nil: quit("cannot open the file" & paramStr(1)) var x: XmlParser @@ -691,4 +822,3 @@ when not defined(testing) and isMainModule: of xmlSpecial: echo("SPECIAL: " & x.charData) close(x) - |