diff options
author | Araq <rumpf_a@web.de> | 2015-07-01 15:47:15 +0200 |
---|---|---|
committer | Araq <rumpf_a@web.de> | 2015-07-01 15:47:15 +0200 |
commit | 0d7e0e1b4fb9e99259eb9f2a1ad42a7c0136e48b (patch) | |
tree | f5dda64458a2a7f3f2b438a530aea2aedb74e773 /lib/pure/parsexml.nim | |
parent | 13259c669dc1dcdd72c88aedfe8689fd333288d3 (diff) | |
download | Nim-0d7e0e1b4fb9e99259eb9f2a1ad42a7c0136e48b.tar.gz |
fixes #2429
Diffstat (limited to 'lib/pure/parsexml.nim')
-rw-r--r-- | lib/pure/parsexml.nim | 272 |
1 files changed, 150 insertions, 122 deletions
diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim index eb792f086..e1abb0a4f 100644 --- a/lib/pure/parsexml.nim +++ b/lib/pure/parsexml.nim @@ -8,19 +8,19 @@ # ## This module implements a simple high performance `XML`:idx: / `HTML`:idx: -## parser. +## parser. ## The only encoding that is supported is UTF-8. The parser has been designed -## to be somewhat error correcting, so that even most "wild HTML" found on the +## to be somewhat error correcting, so that even most "wild HTML" found on the ## web can be parsed with it. **Note:** This parser does not check that each -## ``<tag>`` has a corresponding ``</tag>``! These checks have do be -## implemented by the client code for various reasons: +## ``<tag>`` has a corresponding ``</tag>``! These checks have do be +## implemented by the client code for various reasons: ## ## * Old HTML contains tags that have no end tag: ``<br>`` for example. ## * HTML tags are case insensitive, XML tags are case sensitive. Since this ## library can parse both, only the client knows which comparison is to be ## used. ## * Thus the checks would have been very difficult to implement properly with -## little benefit, especially since they are simple to implement in the +## little benefit, especially since they are simple to implement in the ## client. The client should use the `errorMsgExpected` proc to generate ## a nice error message that fits the other error messages this library ## creates. @@ -29,7 +29,7 @@ ## Example 1: Retrieve HTML title ## ============================== ## -## The file ``examples/htmltitle.nim`` demonstrates how to use the +## The file ``examples/htmltitle.nim`` demonstrates how to use the ## XML parser to accomplish a simple task: To determine the title of an HTML ## document. ## @@ -40,22 +40,22 @@ ## Example 2: Retrieve all HTML links ## ================================== ## -## The file ``examples/htmlrefs.nim`` demonstrates how to use the -## XML parser to accomplish another simple task: To determine all the links +## The file ``examples/htmlrefs.nim`` demonstrates how to use the +## XML parser to accomplish another simple task: To determine all the links ## an HTML document contains. ## ## .. code-block:: nim ## :file: examples/htmlrefs.nim ## -import +import hashes, strutils, lexbase, streams, unicode # the parser treats ``<br />`` as ``<br></br>`` -# xmlElementCloseEnd, ## ``/>`` +# xmlElementCloseEnd, ## ``/>`` -type +type XmlEventKind* = enum ## enumation of all events that may occur when parsing xmlError, ## an error occurred during parsing xmlEof, ## end of file reached @@ -65,13 +65,13 @@ type xmlPI, ## processing instruction (``<?name something ?>``) xmlElementStart, ## ``<elem>`` xmlElementEnd, ## ``</elem>`` - xmlElementOpen, ## ``<elem + xmlElementOpen, ## ``<elem xmlAttribute, ## ``key = "value"`` pair - xmlElementClose, ## ``>`` + xmlElementClose, ## ``>`` xmlCData, ## ``<![CDATA[`` ... data ... ``]]>`` xmlEntity, ## &entity; xmlSpecial ## ``<! ... data ... >`` - + XmlErrorKind* = enum ## enumeration that lists all errors that can occur errNone, ## no error errEndOfCDataExpected, ## ``]]>`` expected @@ -82,8 +82,8 @@ type errEqExpected, ## ``=`` expected errQuoteExpected, ## ``"`` or ``'`` expected errEndOfCommentExpected ## ``-->`` expected - - ParserState = enum + + ParserState = enum stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError XmlParseOption* = enum ## options for the XML parser @@ -121,8 +121,8 @@ proc open*(my: var XmlParser, input: Stream, filename: string, ## the `options` parameter: If `options` contains ``reportWhitespace`` ## a whitespace token is reported as an ``xmlWhitespace`` event. ## If `options` contains ``reportComments`` a comment token is reported as an - ## ``xmlComment`` event. - lexbase.open(my, input) + ## ``xmlComment`` event. + lexbase.open(my, input, 8192, {'\c', '\L', '/'}) my.filename = filename my.state = stateStart my.kind = xmlError @@ -130,24 +130,24 @@ proc open*(my: var XmlParser, input: Stream, filename: string, my.b = "" my.c = nil my.options = options - -proc close*(my: var XmlParser) {.inline.} = + +proc close*(my: var XmlParser) {.inline.} = ## closes the parser `my` and its associated input stream. lexbase.close(my) -proc kind*(my: XmlParser): XmlEventKind {.inline.} = +proc kind*(my: XmlParser): XmlEventKind {.inline.} = ## returns the current event type for the XML parser return my.kind template charData*(my: XmlParser): string = - ## returns the character data for the events: ``xmlCharData``, + ## returns the character data for the events: ``xmlCharData``, ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial`` - assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData, + assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData, xmlSpecial}) my.a template elementName*(my: XmlParser): string = - ## returns the element name for the events: ``xmlElementStart``, + ## returns the element name for the events: ``xmlElementStart``, ## ``xmlElementEnd``, ``xmlElementOpen`` assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen}) my.a @@ -156,12 +156,12 @@ template entityName*(my: XmlParser): string = ## returns the entity name for the event: ``xmlEntity`` assert(my.kind == xmlEntity) my.a - + template attrKey*(my: XmlParser): string = ## returns the attribute key for the event ``xmlAttribute`` assert(my.kind == xmlAttribute) my.a - + template attrValue*(my: XmlParser): string = ## returns the attribute value for the event ``xmlAttribute`` assert(my.kind == xmlAttribute) @@ -187,110 +187,118 @@ proc rawData2*(my: XmlParser): string {.inline.} = ## This is only used for speed hacks. shallowCopy(result, my.b) -proc getColumn*(my: XmlParser): int {.inline.} = +proc getColumn*(my: XmlParser): int {.inline.} = ## get the current column the parser has arrived at. result = getColNumber(my, my.bufpos) -proc getLine*(my: XmlParser): int {.inline.} = +proc getLine*(my: XmlParser): int {.inline.} = ## get the current line the parser has arrived at. result = my.lineNumber -proc getFilename*(my: XmlParser): string {.inline.} = +proc getFilename*(my: XmlParser): string {.inline.} = ## get the filename of the file that the parser processes. result = my.filename - -proc errorMsg*(my: XmlParser): string = + +proc errorMsg*(my: XmlParser): string = ## returns a helpful error message for the event ``xmlError`` assert(my.kind == xmlError) result = "$1($2, $3) Error: $4" % [ my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]] -proc errorMsgExpected*(my: XmlParser, tag: string): string = +proc errorMsgExpected*(my: XmlParser, tag: string): string = ## returns an error message "<tag> expected" in the same format as the - ## other error messages + ## other error messages result = "$1($2, $3) Error: $4" % [ my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag] -proc errorMsg*(my: XmlParser, msg: string): string = +proc errorMsg*(my: XmlParser, msg: string): string = ## returns an error message with text `msg` in the same format as the - ## other error messages + ## other error messages result = "$1($2, $3) Error: $4" % [ my.filename, $getLine(my), $getColumn(my), msg] - -proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} = + +proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} = my.err = kind my.state = stateError -proc parseCDATA(my: var XmlParser) = +proc parseCDATA(my: var XmlParser) = var pos = my.bufpos + len("<![CDATA[") var buf = my.buf while true: - case buf[pos] + case buf[pos] of ']': if buf[pos+1] == ']' and buf[pos+2] == '>': inc(pos, 3) break add(my.a, ']') inc(pos) - of '\0': + of '\0': markError(my, errEndOfCDataExpected) break - of '\c': + of '\c': pos = lexbase.handleCR(my, pos) buf = my.buf add(my.a, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf add(my.a, '\L') + of '/': + pos = lexbase.handleRefillChar(my, pos) + buf = my.buf + add(my.a, '/') else: add(my.a, buf[pos]) - inc(pos) + inc(pos) my.bufpos = pos # store back my.kind = xmlCData -proc parseComment(my: var XmlParser) = +proc parseComment(my: var XmlParser) = var pos = my.bufpos + len("<!--") var buf = my.buf while true: - case buf[pos] + case buf[pos] of '-': if buf[pos+1] == '-' and buf[pos+2] == '>': inc(pos, 3) break if my.options.contains(reportComments): add(my.a, '-') inc(pos) - of '\0': + of '\0': markError(my, errEndOfCommentExpected) break - of '\c': + of '\c': pos = lexbase.handleCR(my, pos) buf = my.buf if my.options.contains(reportComments): add(my.a, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf if my.options.contains(reportComments): add(my.a, '\L') + of '/': + pos = lexbase.handleRefillChar(my, pos) + buf = my.buf + if my.options.contains(reportComments): add(my.a, '/') else: if my.options.contains(reportComments): add(my.a, buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlComment -proc parseWhitespace(my: var XmlParser, skip=false) = +proc parseWhitespace(my: var XmlParser, skip=false) = var pos = my.bufpos var buf = my.buf - while true: + while true: case buf[pos] - of ' ', '\t': + of ' ', '\t': if not skip: add(my.a, buf[pos]) inc(pos) - of '\c': + of '\c': # the specification says that CR-LF, CR are to be transformed to LF pos = lexbase.handleCR(my, pos) buf = my.buf if not skip: add(my.a, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf if not skip: add(my.a, '\L') @@ -302,10 +310,10 @@ const NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'} NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'} -proc parseName(my: var XmlParser, dest: var string) = +proc parseName(my: var XmlParser, dest: var string) = var pos = my.bufpos var buf = my.buf - if buf[pos] in NameStartChar: + if buf[pos] in NameStartChar: while true: add(dest, buf[pos]) inc(pos) @@ -314,14 +322,14 @@ proc parseName(my: var XmlParser, dest: var string) = else: markError(my, errNameExpected) -proc parseEntity(my: var XmlParser, dest: var string) = +proc parseEntity(my: var XmlParser, dest: var string) = var pos = my.bufpos+1 var buf = my.buf my.kind = xmlCharData if buf[pos] == '#': var r: int inc(pos) - if buf[pos] == 'x': + if buf[pos] == 'x': inc(pos) while true: case buf[pos] @@ -331,7 +339,7 @@ proc parseEntity(my: var XmlParser, dest: var string) = else: break inc(pos) else: - while buf[pos] in {'0'..'9'}: + while buf[pos] in {'0'..'9'}: r = r * 10 + (ord(buf[pos]) - ord('0')) inc(pos) add(dest, toUTF8(Rune(r))) @@ -345,11 +353,11 @@ proc parseEntity(my: var XmlParser, dest: var string) = buf[pos+3] == ';': add(dest, '&') inc(pos, 3) - elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and + elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and buf[pos+3] == 's' and buf[pos+4] == ';': add(dest, '\'') inc(pos, 4) - elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and + elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and buf[pos+3] == 't' and buf[pos+4] == ';': add(dest, '"') inc(pos, 4) @@ -357,23 +365,23 @@ proc parseEntity(my: var XmlParser, dest: var string) = my.bufpos = pos parseName(my, dest) pos = my.bufpos - if my.err != errNameExpected: + if my.err != errNameExpected: my.kind = xmlEntity else: add(dest, '&') - if buf[pos] == ';': + if buf[pos] == ';': inc(pos) else: markError(my, errSemicolonExpected) my.bufpos = pos -proc parsePI(my: var XmlParser) = +proc parsePI(my: var XmlParser) = inc(my.bufpos, "<?".len) parseName(my, my.a) var pos = my.bufpos var buf = my.buf setLen(my.b, 0) - while true: + while true: case buf[pos] of '\0': markError(my, errQmGtExpected) @@ -387,29 +395,33 @@ proc parsePI(my: var XmlParser) = of '\c': # the specification says that CR-LF, CR are to be transformed to LF pos = lexbase.handleCR(my, pos) - buf = my.buf + buf = my.buf add(my.b, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf add(my.b, '\L') + of '/': + pos = lexbase.handleRefillChar(my, pos) + buf = my.buf + add(my.b, '/') else: add(my.b, buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlPI -proc parseSpecial(my: var XmlParser) = +proc parseSpecial(my: var XmlParser) = # things that start with <! var pos = my.bufpos + 2 var buf = my.buf var opentags = 0 - while true: + while true: case buf[pos] of '\0': markError(my, errGtExpected) break - of '<': + of '<': inc(opentags) inc(pos) add(my.a, '<') @@ -420,47 +432,55 @@ proc parseSpecial(my: var XmlParser) = dec(opentags) inc(pos) add(my.a, '>') - of '\c': + of '\c': pos = lexbase.handleCR(my, pos) buf = my.buf add(my.a, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf add(my.a, '\L') + of '/': + pos = lexbase.handleRefillChar(my, pos) + buf = my.buf + add(my.b, '/') else: add(my.a, buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlSpecial -proc parseTag(my: var XmlParser) = +proc parseTag(my: var XmlParser) = inc(my.bufpos) parseName(my, my.a) # if we have no name, do not interpret the '<': - if my.a.len == 0: + if my.a.len == 0: my.kind = xmlCharData add(my.a, '<') return parseWhitespace(my, skip=true) - if my.buf[my.bufpos] in NameStartChar: + if my.buf[my.bufpos] in NameStartChar: # an attribute follows: my.kind = xmlElementOpen my.state = stateAttr my.c = my.a # save for later else: my.kind = xmlElementStart - if my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>': - inc(my.bufpos, 2) + let slash = my.buf[my.bufpos] == '/' + if slash: + my.bufpos = lexbase.handleRefillChar(my, my.bufpos) + if slash and my.buf[my.bufpos] == '>': + inc(my.bufpos) my.state = stateEmptyElementTag my.c = nil elif my.buf[my.bufpos] == '>': - inc(my.bufpos) + inc(my.bufpos) else: markError(my, errGtExpected) - -proc parseEndTag(my: var XmlParser) = - inc(my.bufpos, 2) + +proc parseEndTag(my: var XmlParser) = + my.bufpos = lexbase.handleRefillChar(my, my.bufpos+1) + #inc(my.bufpos, 2) parseName(my, my.a) parseWhitespace(my, skip=true) if my.buf[my.bufpos] == '>': @@ -469,13 +489,13 @@ proc parseEndTag(my: var XmlParser) = markError(my, errGtExpected) my.kind = xmlElementEnd -proc parseAttribute(my: var XmlParser) = +proc parseAttribute(my: var XmlParser) = my.kind = xmlAttribute setLen(my.a, 0) setLen(my.b, 0) parseName(my, my.a) # if we have no name, we have '<tag attr= key %&$$%': - if my.a.len == 0: + if my.a.len == 0: markError(my, errGtExpected) return parseWhitespace(my, skip=true) @@ -491,27 +511,27 @@ proc parseAttribute(my: var XmlParser) = var quote = buf[pos] var pendingSpace = false inc(pos) - while true: + while true: case buf[pos] of '\0': markError(my, errQuoteExpected) break - of '&': - if pendingSpace: + of '&': + if pendingSpace: add(my.b, ' ') pendingSpace = false my.bufpos = pos parseEntity(my, my.b) my.kind = xmlAttribute # parseEntity overwrites my.kind! pos = my.bufpos - of ' ', '\t': + of ' ', '\t': pendingSpace = true inc(pos) - of '\c': + of '\c': pos = lexbase.handleCR(my, pos) buf = my.buf pendingSpace = true - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf pendingSpace = true @@ -520,44 +540,48 @@ proc parseAttribute(my: var XmlParser) = inc(pos) break else: - if pendingSpace: + if pendingSpace: add(my.b, ' ') pendingSpace = false add(my.b, buf[pos]) inc(pos) else: - markError(my, errQuoteExpected) + markError(my, errQuoteExpected) my.bufpos = pos parseWhitespace(my, skip=true) - -proc parseCharData(my: var XmlParser) = + +proc parseCharData(my: var XmlParser) = var pos = my.bufpos var buf = my.buf - while true: + while true: case buf[pos] of '\0', '<', '&': break - of '\c': + of '\c': # the specification says that CR-LF, CR are to be transformed to LF pos = lexbase.handleCR(my, pos) buf = my.buf add(my.a, '\L') - of '\L': + of '\L': pos = lexbase.handleLF(my, pos) buf = my.buf add(my.a, '\L') + of '/': + pos = lexbase.handleRefillChar(my, pos) + buf = my.buf + add(my.a, '/') else: add(my.a, buf[pos]) inc(pos) my.bufpos = pos my.kind = xmlCharData -proc rawGetTok(my: var XmlParser) = +proc rawGetTok(my: var XmlParser) = my.kind = xmlError setLen(my.a, 0) var pos = my.bufpos var buf = my.buf case buf[pos] - of '<': + of '<': case buf[pos+1] of '/': parseEndTag(my) @@ -566,44 +590,44 @@ proc rawGetTok(my: var XmlParser) = buf[pos+5] == 'A' and buf[pos+6] == 'T' and buf[pos+7] == 'A' and buf[pos+8] == '[': parseCDATA(my) - elif buf[pos+2] == '-' and buf[pos+3] == '-': + elif buf[pos+2] == '-' and buf[pos+3] == '-': parseComment(my) - else: + else: parseSpecial(my) of '?': parsePI(my) - else: + else: parseTag(my) - of ' ', '\t', '\c', '\l': + of ' ', '\t', '\c', '\l': parseWhitespace(my) my.kind = xmlWhitespace - of '\0': + of '\0': my.kind = xmlEof of '&': parseEntity(my, my.a) - else: + else: parseCharData(my) assert my.kind != xmlError - -proc getTok(my: var XmlParser) = + +proc getTok(my: var XmlParser) = while true: rawGetTok(my) case my.kind - of xmlComment: + of xmlComment: if my.options.contains(reportComments): break - of xmlWhitespace: + of xmlWhitespace: if my.options.contains(reportWhitespace): break else: break - -proc next*(my: var XmlParser) = + +proc next*(my: var XmlParser) = ## retrieves the first/next event. This controls the parser. case my.state of stateNormal: - getTok(my) + getTok(my) of stateStart: my.state = stateNormal getTok(my) - if my.kind == xmlPI and my.a == "xml": + if my.kind == xmlPI and my.a == "xml": # just skip the first ``<?xml >`` processing instruction getTok(my) of stateAttr: @@ -612,10 +636,14 @@ proc next*(my: var XmlParser) = my.kind = xmlElementClose inc(my.bufpos) my.state = stateNormal - elif my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>': - my.kind = xmlElementClose - inc(my.bufpos, 2) - my.state = stateEmptyElementTag + elif my.buf[my.bufpos] == '/': + my.bufpos = lexbase.handleRefillChar(my, my.bufpos) + if my.buf[my.bufpos] == '>': + my.kind = xmlElementClose + inc(my.bufpos) + my.state = stateEmptyElementTag + else: + markError(my, errGtExpected) else: parseAttribute(my) # state remains the same @@ -624,10 +652,10 @@ proc next*(my: var XmlParser) = my.kind = xmlElementEnd if not my.c.isNil: my.a = my.c - of stateError: + of stateError: my.kind = xmlError my.state = stateNormal - + when not defined(testing) and isMainModule: import os var s = newFileStream(paramStr(1), fmRead) @@ -645,13 +673,13 @@ when not defined(testing) and isMainModule: of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest]) of xmlElementStart: echo("<$1>" % x.elementName) of xmlElementEnd: echo("</$1>" % x.elementName) - - of xmlElementOpen: echo("<$1" % x.elementName) - of xmlAttribute: + + of xmlElementOpen: echo("<$1" % x.elementName) + of xmlAttribute: echo("Key: " & x.attrKey) echo("Value: " & x.attrValue) - - of xmlElementClose: echo(">") + + of xmlElementClose: echo(">") of xmlCData: echo("<![CDATA[$1]]>" % x.charData) of xmlEntity: |