diff options
author | Araq <rumpf_a@web.de> | 2015-07-01 15:47:15 +0200 |
---|---|---|
committer | Araq <rumpf_a@web.de> | 2015-07-01 15:47:15 +0200 |
commit | 0d7e0e1b4fb9e99259eb9f2a1ad42a7c0136e48b (patch) | |
tree | f5dda64458a2a7f3f2b438a530aea2aedb74e773 | |
parent | 13259c669dc1dcdd72c88aedfe8689fd333288d3 (diff) | |
download | Nim-0d7e0e1b4fb9e99259eb9f2a1ad42a7c0136e48b.tar.gz |
fixes #2429
-rw-r--r-- | lib/pure/lexbase.nim | 62 | ||||
-rw-r--r-- | lib/pure/parsexml.nim | 272 |
2 files changed, 178 insertions, 156 deletions
diff --git a/lib/pure/lexbase.nim b/lib/pure/lexbase.nim index 585ba87f5..bfecf6a58 100644 --- a/lib/pure/lexbase.nim +++ b/lib/pure/lexbase.nim @@ -34,37 +34,15 @@ type lineNumber*: int ## the current line number sentinel: int lineStart: int # index of last line start in buffer - fileOpened: bool + refillChars: set[char] {.deprecated: [TBaseLexer: BaseLexer].} -proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192) - ## inits the BaseLexer with a stream to read from - -proc close*(L: var BaseLexer) - ## closes the base lexer. This closes `L`'s associated stream too. - -proc getCurrentLine*(L: BaseLexer, marker: bool = true): string - ## retrieves the current line. - -proc getColNumber*(L: BaseLexer, pos: int): int - ## retrieves the current column. - -proc handleCR*(L: var BaseLexer, pos: int): int - ## Call this if you scanned over '\c' in the buffer; it returns the the - ## position to continue the scanning from. `pos` must be the position - ## of the '\c'. -proc handleLF*(L: var BaseLexer, pos: int): int - ## Call this if you scanned over '\L' in the buffer; it returns the the - ## position to continue the scanning from. `pos` must be the position - ## of the '\L'. - -# implementation - const chrSize = sizeof(char) -proc close(L: var BaseLexer) = +proc close*(L: var BaseLexer) = + ## closes the base lexer. This closes `L`'s associated stream too. dealloc(L.buf) close(L.input) @@ -80,7 +58,7 @@ proc fillBuffer(L: var BaseLexer) = toCopy = L.bufLen - L.sentinel - 1 assert(toCopy >= 0) if toCopy > 0: - moveMem(L.buf, addr(L.buf[L.sentinel + 1]), toCopy * chrSize) + moveMem(L.buf, addr(L.buf[L.sentinel + 1]), toCopy * chrSize) # "moveMem" handles overlapping regions charsRead = readData(L.input, addr(L.buf[toCopy]), (L.sentinel + 1) * chrSize) div chrSize @@ -93,7 +71,7 @@ proc fillBuffer(L: var BaseLexer) = dec(s) # BUGFIX (valgrind) while true: assert(s < L.bufLen) - while (s >= 0) and not (L.buf[s] in NewLines): dec(s) + while s >= 0 and L.buf[s] notin L.refillChars: dec(s) if s >= 0: # we found an appropriate character for a sentinel: L.sentinel = s @@ -121,31 +99,46 @@ proc fillBaseLexer(L: var BaseLexer, pos: int): int = fillBuffer(L) L.bufpos = 0 # XXX: is this really correct? result = 0 - L.lineStart = result -proc handleCR(L: var BaseLexer, pos: int): int = +proc handleCR*(L: var BaseLexer, pos: int): int = + ## Call this if you scanned over '\c' in the buffer; it returns the the + ## position to continue the scanning from. `pos` must be the position + ## of the '\c'. assert(L.buf[pos] == '\c') inc(L.lineNumber) result = fillBaseLexer(L, pos) if L.buf[result] == '\L': result = fillBaseLexer(L, result) + L.lineStart = result -proc handleLF(L: var BaseLexer, pos: int): int = +proc handleLF*(L: var BaseLexer, pos: int): int = + ## Call this if you scanned over '\L' in the buffer; it returns the the + ## position to continue the scanning from. `pos` must be the position + ## of the '\L'. assert(L.buf[pos] == '\L') inc(L.lineNumber) result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result; + L.lineStart = result + +proc handleRefillChar*(L: var BaseLexer, pos: int): int = + ## To be documented. + assert(L.buf[pos] in L.refillChars) + result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result; proc skipUtf8Bom(L: var BaseLexer) = if (L.buf[0] == '\xEF') and (L.buf[1] == '\xBB') and (L.buf[2] == '\xBF'): inc(L.bufpos, 3) inc(L.lineStart, 3) -proc open(L: var BaseLexer, input: Stream, bufLen: int = 8192) = +proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192; + refillChars: set[char] = NewLines) = + ## inits the BaseLexer with a stream to read from. assert(bufLen > 0) assert(input != nil) L.input = input L.bufpos = 0 L.bufLen = bufLen + L.refillChars = refillChars L.buf = cast[cstring](alloc(bufLen * chrSize)) L.sentinel = bufLen - 1 L.lineStart = 0 @@ -153,10 +146,12 @@ proc open(L: var BaseLexer, input: Stream, bufLen: int = 8192) = fillBuffer(L) skipUtf8Bom(L) -proc getColNumber(L: BaseLexer, pos: int): int = +proc getColNumber*(L: BaseLexer, pos: int): int = + ## retrieves the current column. result = abs(pos - L.lineStart) -proc getCurrentLine(L: BaseLexer, marker: bool = true): string = +proc getCurrentLine*(L: BaseLexer, marker: bool = true): string = + ## retrieves the current line. var i: int result = "" i = L.lineStart @@ -166,4 +161,3 @@ proc getCurrentLine(L: BaseLexer, marker: bool = true): string = add(result, "\n") if marker: add(result, spaces(getColNumber(L, L.bufpos)) & "^\n") - diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim index eb792f086..e1abb0a4f 100644 --- a/lib/pure/parsexml.nim +++ b/lib/pure/parsexml.nim @@ -8,19 +8,19 @@ # ## This module implements a simple high performance `XML`:idx: / `HTML`:idx: -## parser. +## parser. ## The only encoding that is supported is UTF-8. The parser has been designed -## to be somewhat error correcting, so that even most "wild HTML" found on the +## to be somewhat error correcting, so that even most "wild HTML" found on the ## web can be parsed with it. **Note:** This parser does not check that each -## ``<tag>`` has a corresponding ``</tag>``! These checks have do be -## implemented by the client code for various reasons: +## ``<tag>`` has a corresponding ``</tag>``! These checks have do be +## implemented by the client code for various reasons: ## ## * Old HTML contains tags that have no end tag: ``<br>`` for example. ## * HTML tags are case insensitive, XML tags are case sensitive. Since this ## library can parse both, only the client knows which comparison is to be ## used. ## * Thus the checks would have been very difficult to implement properly with -## little benefit, especially since they are simple to implement in the +## little benefit, especially since they are simple to implement in the ## client. The client should use the `errorMsgExpected` proc to generate ## a nice error message that fits the other error messages this library ## creates. @@ -29,7 +29,7 @@ ## Example 1: Retrieve HTML title ## ============================== ## -## The file ``examples/htmltitle.nim`` demonstrates how to use the +## The file ``examples/htmltitle.nim`` demonstrates how to use the ## XML parser to accomplish a simple task: To determine the title of an HTML ## document. ## @@ -40,22 +40,22 @@ ## Example 2: Retrieve all HTML links ## ================================== ## -## The file ``examples/htmlrefs.nim`` demonstrates how to use the -## XML parser to accomplish another simple task: To determine all the links +## The file ``examples/htmlrefs.nim`` demonstrates how to use the +## XML parser to accomplish another simple task: To determine all the links ## an HTML document contains. ## ## .. code-block:: nim ## :file: examples/htmlrefs.nim ## -import +import hashes, strutils, lexbase, streams, unicode # the parser treats ``<br />`` as ``<br></br>`` -# xmlElementCloseEnd, ## ``/>`` +# xmlElementCloseEnd, ## ``/>`` -type +type XmlEventKind* = enum ## enumation of all events that may occur when parsing xmlError, ## an error occurred during parsing xmlEof, ## end of file reached @@ -65,13 +65,13 @@ type xmlPI, ## processing instruction (``<?name something ?>``) xmlElementStart, ## ``<elem>`` xmlElementEnd, ## ``</elem>`` - xmlElementOpen, ## ``<elem + xmlElementOpen, ## ``<elem xmlAttribute, ## ``key = "value"`` pair - xmlElementClose, ## ``>`` + xmlElementClose, ## ``>`` xmlCData, ## ``<![CDATA[`` ... data ... ``]]>`` xmlEntity, ## &entity; xmlSpecial ## ``<! ... data ... >`` - + XmlErrorKind* = enum ## enumeration that lists all errors that can occur errNone, ## no error errEndOfCDataExpected, ## ``]]>`` expected @@ -82,8 +82,8 @@ type errEqExpected, ## ``=`` expected errQuoteExpected, ## ``"`` or ``'`` expected errEndOfCommentExpected ## ``-->`` expected - - ParserState = enum + + ParserState = enum stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError XmlParseOption* = enum ## options for the XML parser @@ -121,8 +121,8 @@ proc open*(my: var XmlParser, input: Stream, filename: string, ## the `options` parameter: If `options` contains ``reportWhitespace`` ## a whitespace token is reported as an ``xmlWhitespace`` event. ## If `options` contains ``reportComments`` a comment token is reported as an - ## ``xmlComment`` event. - lexbase.open(my, input) + ## ``xmlComment`` event. + lexbase.open(my, input, 8192, {'\c', '\L', '/'}) my.filename = filename my.state = stateStart my.kind = xmlError @@ -130,24 +130,24 @@ proc open*(my: var XmlParser, input: Stream, filename: string, my.b = "" my.c = nil my.options = options - -proc close*(my: var XmlParser) {.inline.} = + +proc close*(my: var XmlParser) {.inline.} = ## closes the parser `my` and its associated input stream. lexbase.close(my) -proc kind*(my: XmlParser): XmlEventKind {.inline.} = +proc kind*(my: XmlParser): XmlEventKind {.inline.} = ## returns the current event type for the XML parser return my.kind template charData*(my: XmlParser): string = - ## returns the character data for the events: ``xmlCharData``, + ## returns the character data for the events: ``xmlCharData``, ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial`` - assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData, + assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData, xmlSpecial}) my.a template elementName*(my: XmlParser): string = - ## returns the element name for the events: ``xmlElementStart``, + ## returns the element name for the events: ``xmlElementStart``, ## ``xmlElementEnd``, ``xmlElementOpen`` assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen}) my.a @@ -156,12 +156,12 @@ template entityName*(my: XmlParser): string = ## returns the entity name for the event: ``xmlEntity`` assert(my.kind == xmlEntity) my.a - + template attrKey*(my: XmlParser): string = ## returns the attribute key for the event ``xmlAttribute`` assert(my.kind == xmlAttribute) my.a - + template attrValue*(my: XmlParser): string = ## returns the attribute value for the event ``xmlAttribute`` assert(my.kind == xmlAttribute) @@ -187,110 +187,118 @@ proc rawData2*(my: XmlParser): string {.inline.} = ## This is only used for speed hacks. shallowCopy(result, my.b) -proc getColumn*(my: XmlParser): int {.inline.} = +proc getColumn*(my: XmlParser): int {.inline.} = ## get the current column the parser has arrived at. result = getColNumber(my, my.bufpos) -proc getLine*(my: XmlParser): int {.inline.} = +proc getLine*(my: XmlParser): int {.inline.} = ## get the current line the parser has arrived at. result = my.lineNumber -proc getFilename*(my: XmlParser): string {.inline.} = +proc getFilename*(my: XmlParser): string {.inline.} = ## get the filename of the file that the parser processes. result = my.filename - -proc errorMsg*(my: XmlParser): string = + +proc errorMsg*(my: XmlParser): string = ## returns a helpful error message for the event ``xmlError`` assert(my.kind == xmlError) result = "$1($2, $3) Error: $4" % [ my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]] -proc errorMsgExpected*(my: XmlParser, tag: string): string = +proc errorMsgExpected*(my: XmlParser, tag: string): string = ## returns an error message "<tag> expected" in the same format as the - ## other error messages + ## other error messages result = "$1($2, $3) Error: $4" % [ my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag] -proc errorMsg*(my: XmlParser, msg: string): string = +proc errorMsg*(my: XmlParser, msg: string): string = ## returns an error message with text `msg` in the same format as the - ## other error messages + ## other error messages result = "$1($2, $3) Error: $4" % [ my.filename, $getLine(my), $getColumn(my), msg] - -proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} = + +proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} = my.err = kind my.state = stateError -proc parseCDATA(my: var XmlParser) = +proc parseCDATA(my: var XmlParser) = var pos = my.bufpos + len("<![CDATA[") var buf = my.buf while true: - case buf[pos] + case buf[pos] of ']': if buf[pos+1] == ']' and buf[pos+2] == '>': inc(pos, 3) break add(my.a, ']') inc(pos) - of '\0': + of '\0': markError(my, errEndOfCDataExpected) break - of '\c': + of '\c': pos = lexbase.handleCR(my, pos) buf = my.buf add(my.a, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf add(my.a, '\L') + of '/': + pos = lexbase.handleRefillChar(my, pos) + buf = my.buf + add(my.a, '/') else: add(my.a, buf[pos]) - inc(pos) + inc(pos) my.bufpos = pos # store back my.kind = xmlCData -proc parseComment(my: var XmlParser) = +proc parseComment(my: var XmlParser) = var pos = my.bufpos + len("<!--") var buf = my.buf while true: - case buf[pos] + case buf[pos] of '-': if buf[pos+1] == '-' and buf[pos+2] == '>': inc(pos, 3) break if my.options.contains(reportComments): add(my.a, '-') inc(pos) - of '\0': + of '\0': markError(my, errEndOfCommentExpected) break - of '\c': + of '\c': pos = lexbase.handleCR(my, pos) buf = my.buf if my.options.contains(reportComments): add(my.a, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf if my.options.contains(reportComments): add(my.a, '\L') + of '/': + pos = lexbase.handleRefillChar(my, pos) + buf = my.buf + if my.options.contains(reportComments): add(my.a, '/') else: if my.options.contains(reportComments): add(my.a, buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlComment -proc parseWhitespace(my: var XmlParser, skip=false) = +proc parseWhitespace(my: var XmlParser, skip=false) = var pos = my.bufpos var buf = my.buf - while true: + while true: case buf[pos] - of ' ', '\t': + of ' ', '\t': if not skip: add(my.a, buf[pos]) inc(pos) - of '\c': + of '\c': # the specification says that CR-LF, CR are to be transformed to LF pos = lexbase.handleCR(my, pos) buf = my.buf if not skip: add(my.a, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf if not skip: add(my.a, '\L') @@ -302,10 +310,10 @@ const NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'} NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'} -proc parseName(my: var XmlParser, dest: var string) = +proc parseName(my: var XmlParser, dest: var string) = var pos = my.bufpos var buf = my.buf - if buf[pos] in NameStartChar: + if buf[pos] in NameStartChar: while true: add(dest, buf[pos]) inc(pos) @@ -314,14 +322,14 @@ proc parseName(my: var XmlParser, dest: var string) = else: markError(my, errNameExpected) -proc parseEntity(my: var XmlParser, dest: var string) = +proc parseEntity(my: var XmlParser, dest: var string) = var pos = my.bufpos+1 var buf = my.buf my.kind = xmlCharData if buf[pos] == '#': var r: int inc(pos) - if buf[pos] == 'x': + if buf[pos] == 'x': inc(pos) while true: case buf[pos] @@ -331,7 +339,7 @@ proc parseEntity(my: var XmlParser, dest: var string) = else: break inc(pos) else: - while buf[pos] in {'0'..'9'}: + while buf[pos] in {'0'..'9'}: r = r * 10 + (ord(buf[pos]) - ord('0')) inc(pos) add(dest, toUTF8(Rune(r))) @@ -345,11 +353,11 @@ proc parseEntity(my: var XmlParser, dest: var string) = buf[pos+3] == ';': add(dest, '&') inc(pos, 3) - elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and + elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and buf[pos+3] == 's' and buf[pos+4] == ';': add(dest, '\'') inc(pos, 4) - elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and + elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and buf[pos+3] == 't' and buf[pos+4] == ';': add(dest, '"') inc(pos, 4) @@ -357,23 +365,23 @@ proc parseEntity(my: var XmlParser, dest: var string) = my.bufpos = pos parseName(my, dest) pos = my.bufpos - if my.err != errNameExpected: + if my.err != errNameExpected: my.kind = xmlEntity else: add(dest, '&') - if buf[pos] == ';': + if buf[pos] == ';': inc(pos) else: markError(my, errSemicolonExpected) my.bufpos = pos -proc parsePI(my: var XmlParser) = +proc parsePI(my: var XmlParser) = inc(my.bufpos, "<?".len) parseName(my, my.a) var pos = my.bufpos var buf = my.buf setLen(my.b, 0) - while true: + while true: case buf[pos] of '\0': markError(my, errQmGtExpected) @@ -387,29 +395,33 @@ proc parsePI(my: var XmlParser) = of '\c': # the specification says that CR-LF, CR are to be transformed to LF pos = lexbase.handleCR(my, pos) - buf = my.buf + buf = my.buf add(my.b, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf add(my.b, '\L') + of '/': + pos = lexbase.handleRefillChar(my, pos) + buf = my.buf + add(my.b, '/') else: add(my.b, buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlPI -proc parseSpecial(my: var XmlParser) = +proc parseSpecial(my: var XmlParser) = # things that start with <! var pos = my.bufpos + 2 var buf = my.buf var opentags = 0 - while true: + while true: case buf[pos] of '\0': markError(my, errGtExpected) break - of '<': + of '<': inc(opentags) inc(pos) add(my.a, '<') @@ -420,47 +432,55 @@ proc parseSpecial(my: var XmlParser) = dec(opentags) inc(pos) add(my.a, '>') - of '\c': + of '\c': pos = lexbase.handleCR(my, pos) buf = my.buf add(my.a, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf add(my.a, '\L') + of '/': + pos = lexbase.handleRefillChar(my, pos) + buf = my.buf + add(my.b, '/') else: add(my.a, buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlSpecial -proc parseTag(my: var XmlParser) = +proc parseTag(my: var XmlParser) = inc(my.bufpos) parseName(my, my.a) # if we have no name, do not interpret the '<': - if my.a.len == 0: + if my.a.len == 0: my.kind = xmlCharData add(my.a, '<') return parseWhitespace(my, skip=true) - if my.buf[my.bufpos] in NameStartChar: + if my.buf[my.bufpos] in NameStartChar: # an attribute follows: my.kind = xmlElementOpen my.state = stateAttr my.c = my.a # save for later else: my.kind = xmlElementStart - if my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>': - inc(my.bufpos, 2) + let slash = my.buf[my.bufpos] == '/' + if slash: + my.bufpos = lexbase.handleRefillChar(my, my.bufpos) + if slash and my.buf[my.bufpos] == '>': + inc(my.bufpos) my.state = stateEmptyElementTag my.c = nil elif my.buf[my.bufpos] == '>': - inc(my.bufpos) + inc(my.bufpos) else: markError(my, errGtExpected) - -proc parseEndTag(my: var XmlParser) = - inc(my.bufpos, 2) + +proc parseEndTag(my: var XmlParser) = + my.bufpos = lexbase.handleRefillChar(my, my.bufpos+1) + #inc(my.bufpos, 2) parseName(my, my.a) parseWhitespace(my, skip=true) if my.buf[my.bufpos] == '>': @@ -469,13 +489,13 @@ proc parseEndTag(my: var XmlParser) = markError(my, errGtExpected) my.kind = xmlElementEnd -proc parseAttribute(my: var XmlParser) = +proc parseAttribute(my: var XmlParser) = my.kind = xmlAttribute setLen(my.a, 0) setLen(my.b, 0) parseName(my, my.a) # if we have no name, we have '<tag attr= key %&$$%': - if my.a.len == 0: + if my.a.len == 0: markError(my, errGtExpected) return parseWhitespace(my, skip=true) @@ -491,27 +511,27 @@ proc parseAttribute(my: var XmlParser) = var quote = buf[pos] var pendingSpace = false inc(pos) - while true: + while true: case buf[pos] of '\0': markError(my, errQuoteExpected) break - of '&': - if pendingSpace: + of '&': + if pendingSpace: add(my.b, ' ') pendingSpace = false my.bufpos = pos parseEntity(my, my.b) my.kind = xmlAttribute # parseEntity overwrites my.kind! pos = my.bufpos - of ' ', '\t': + of ' ', '\t': pendingSpace = true inc(pos) - of '\c': + of '\c': pos = lexbase.handleCR(my, pos) buf = my.buf pendingSpace = true - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf pendingSpace = true @@ -520,44 +540,48 @@ proc parseAttribute(my: var XmlParser) = inc(pos) break else: - if pendingSpace: + if pendingSpace: add(my.b, ' ') pendingSpace = false add(my.b, buf[pos]) inc(pos) else: - markError(my, errQuoteExpected) + markError(my, errQuoteExpected) my.bufpos = pos parseWhitespace(my, skip=true) - -proc parseCharData(my: var XmlParser) = + +proc parseCharData(my: var XmlParser) = var pos = my.bufpos var buf = my.buf - while true: + while true: case buf[pos] of '\0', '<', '&': break - of '\c': + of '\c': # the specification says that CR-LF, CR are to be transformed to LF pos = lexbase.handleCR(my, pos) buf = my.buf add(my.a, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf add(my.a, '\L') + of '/': + pos = lexbase.handleRefillChar(my, pos) + buf = my.buf + add(my.a, '/') else: add(my.a, buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlCharData -proc rawGetTok(my: var XmlParser) = +proc rawGetTok(my: var XmlParser) = my.kind = xmlError setLen(my.a, 0) var pos = my.bufpos var buf = my.buf case buf[pos] - of '<': + of '<': case buf[pos+1] of '/': parseEndTag(my) @@ -566,44 +590,44 @@ proc rawGetTok(my: var XmlParser) = buf[pos+5] == 'A' and buf[pos+6] == 'T' and buf[pos+7] == 'A' and buf[pos+8] == '[': parseCDATA(my) - elif buf[pos+2] == '-' and buf[pos+3] == '-': + elif buf[pos+2] == '-' and buf[pos+3] == '-': parseComment(my) - else: + else: parseSpecial(my) of '?': parsePI(my) - else: + else: parseTag(my) - of ' ', '\t', '\c', '\l': + of ' ', '\t', '\c', '\l': parseWhitespace(my) my.kind = xmlWhitespace - of '\0': + of '\0': my.kind = xmlEof of '&': parseEntity(my, my.a) - else: + else: parseCharData(my) assert my.kind != xmlError - -proc getTok(my: var XmlParser) = + +proc getTok(my: var XmlParser) = while true: rawGetTok(my) case my.kind - of xmlComment: + of xmlComment: if my.options.contains(reportComments): break - of xmlWhitespace: + of xmlWhitespace: if my.options.contains(reportWhitespace): break else: break - -proc next*(my: var XmlParser) = + +proc next*(my: var XmlParser) = ## retrieves the first/next event. This controls the parser. case my.state of stateNormal: - getTok(my) + getTok(my) of stateStart: my.state = stateNormal getTok(my) - if my.kind == xmlPI and my.a == "xml": + if my.kind == xmlPI and my.a == "xml": # just skip the first ``<?xml >`` processing instruction getTok(my) of stateAttr: @@ -612,10 +636,14 @@ proc next*(my: var XmlParser) = my.kind = xmlElementClose inc(my.bufpos) my.state = stateNormal - elif my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>': - my.kind = xmlElementClose - inc(my.bufpos, 2) - my.state = stateEmptyElementTag + elif my.buf[my.bufpos] == '/': + my.bufpos = lexbase.handleRefillChar(my, my.bufpos) + if my.buf[my.bufpos] == '>': + my.kind = xmlElementClose + inc(my.bufpos) + my.state = stateEmptyElementTag + else: + markError(my, errGtExpected) else: parseAttribute(my) # state remains the same @@ -624,10 +652,10 @@ proc next*(my: var XmlParser) = my.kind = xmlElementEnd if not my.c.isNil: my.a = my.c - of stateError: + of stateError: my.kind = xmlError my.state = stateNormal - + when not defined(testing) and isMainModule: import os var s = newFileStream(paramStr(1), fmRead) @@ -645,13 +673,13 @@ when not defined(testing) and isMainModule: of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest]) of xmlElementStart: echo("<$1>" % x.elementName) of xmlElementEnd: echo("</$1>" % x.elementName) - - of xmlElementOpen: echo("<$1" % x.elementName) - of xmlAttribute: + + of xmlElementOpen: echo("<$1" % x.elementName) + of xmlAttribute: echo("Key: " & x.attrKey) echo("Value: " & x.attrValue) - - of xmlElementClose: echo(">") + + of xmlElementClose: echo(">") of xmlCData: echo("<![CDATA[$1]]>" % x.charData) of xmlEntity: |