fixes #2429

author: Araq <rumpf_a@web.de> 2015-07-01 15:47:15 +0200
committer: Araq <rumpf_a@web.de> 2015-07-01 15:47:15 +0200
commit: 0d7e0e1b4fb9e99259eb9f2a1ad42a7c0136e48b (patch)
tree: f5dda64458a2a7f3f2b438a530aea2aedb74e773
parent: 13259c669dc1dcdd72c88aedfe8689fd333288d3 (diff)
download: Nim-0d7e0e1b4fb9e99259eb9f2a1ad42a7c0136e48b.tar.gz
2 files changed, 178 insertions, 156 deletions
diff --git a/lib/pure/lexbase.nim b/lib/pure/lexbase.nim
index 585ba87f5..bfecf6a58 100644
--- a/lib/pure/lexbase.nim
+++ b/lib/pure/lexbase.nim
@@ -34,37 +34,15 @@ type
     lineNumber*: int          ## the current line number
     sentinel: int
     lineStart: int            # index of last line start in buffer
-    fileOpened: bool
+    refillChars: set[char]
 
 {.deprecated: [TBaseLexer: BaseLexer].}
 
-proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192)
-  ## inits the BaseLexer with a stream to read from
-
-proc close*(L: var BaseLexer)
-  ## closes the base lexer. This closes `L`'s associated stream too.
-
-proc getCurrentLine*(L: BaseLexer, marker: bool = true): string
-  ## retrieves the current line.
-
-proc getColNumber*(L: BaseLexer, pos: int): int
-  ## retrieves the current column.
-
-proc handleCR*(L: var BaseLexer, pos: int): int
-  ## Call this if you scanned over '\c' in the buffer; it returns the the
-  ## position to continue the scanning from. `pos` must be the position
-  ## of the '\c'.
-proc handleLF*(L: var BaseLexer, pos: int): int
-  ## Call this if you scanned over '\L' in the buffer; it returns the the
-  ## position to continue the scanning from. `pos` must be the position
-  ## of the '\L'.
-
-# implementation
-
 const
   chrSize = sizeof(char)
 
-proc close(L: var BaseLexer) =
+proc close*(L: var BaseLexer) =
+  ## closes the base lexer. This closes `L`'s associated stream too.
   dealloc(L.buf)
   close(L.input)
 
@@ -80,7 +58,7 @@ proc fillBuffer(L: var BaseLexer) =
   toCopy = L.bufLen - L.sentinel - 1
   assert(toCopy >= 0)
   if toCopy > 0:
-    moveMem(L.buf, addr(L.buf[L.sentinel + 1]), toCopy * chrSize) 
+    moveMem(L.buf, addr(L.buf[L.sentinel + 1]), toCopy * chrSize)
     # "moveMem" handles overlapping regions
   charsRead = readData(L.input, addr(L.buf[toCopy]),
                        (L.sentinel + 1) * chrSize) div chrSize
@@ -93,7 +71,7 @@ proc fillBuffer(L: var BaseLexer) =
     dec(s)                    # BUGFIX (valgrind)
     while true:
       assert(s < L.bufLen)
-      while (s >= 0) and not (L.buf[s] in NewLines): dec(s)
+      while s >= 0 and L.buf[s] notin L.refillChars: dec(s)
       if s >= 0:
         # we found an appropriate character for a sentinel:
         L.sentinel = s
@@ -121,31 +99,46 @@ proc fillBaseLexer(L: var BaseLexer, pos: int): int =
     fillBuffer(L)
     L.bufpos = 0              # XXX: is this really correct?
     result = 0
-  L.lineStart = result
 
-proc handleCR(L: var BaseLexer, pos: int): int =
+proc handleCR*(L: var BaseLexer, pos: int): int =
+  ## Call this if you scanned over '\c' in the buffer; it returns the
+  ## position to continue the scanning from. `pos` must be the position
+  ## of the '\c'.
   assert(L.buf[pos] == '\c')
   inc(L.lineNumber)
   result = fillBaseLexer(L, pos)
   if L.buf[result] == '\L':
     result = fillBaseLexer(L, result)
+  L.lineStart = result
 
-proc handleLF(L: var BaseLexer, pos: int): int =
+proc handleLF*(L: var BaseLexer, pos: int): int =
+  ## Call this if you scanned over '\L' in the buffer; it returns the
+  ## position to continue the scanning from. `pos` must be the position
+  ## of the '\L'.
   assert(L.buf[pos] == '\L')
   inc(L.lineNumber)
   result = fillBaseLexer(L, pos) #L.lastNL := result-1; // BUGFIX: was: result;
+  L.lineStart = result
+
+proc handleRefillChar*(L: var BaseLexer, pos: int): int =
+  ## Call this after scanning over a char in `refillChars` at `pos`; returns the position to continue scanning from.
+  assert(L.buf[pos] in L.refillChars)
+  result = fillBaseLexer(L, pos) # unlike handleCR/handleLF: lineNumber and lineStart are left untouched
 
 proc skipUtf8Bom(L: var BaseLexer) =
   if (L.buf[0] == '\xEF') and (L.buf[1] == '\xBB') and (L.buf[2] == '\xBF'):
     inc(L.bufpos, 3)
     inc(L.lineStart, 3)
 
-proc open(L: var BaseLexer, input: Stream, bufLen: int = 8192) =
+proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192;
+           refillChars: set[char] = NewLines) =
+  ## inits the BaseLexer with a stream to read from.
   assert(bufLen > 0)
   assert(input != nil)
   L.input = input
   L.bufpos = 0
   L.bufLen = bufLen
+  L.refillChars = refillChars
   L.buf = cast[cstring](alloc(bufLen * chrSize))
   L.sentinel = bufLen - 1
   L.lineStart = 0
@@ -153,10 +146,12 @@ proc open(L: var BaseLexer, input: Stream, bufLen: int = 8192) =
   fillBuffer(L)
   skipUtf8Bom(L)
 
-proc getColNumber(L: BaseLexer, pos: int): int =
+proc getColNumber*(L: BaseLexer, pos: int): int =
+  ## retrieves the current column.
   result = abs(pos - L.lineStart)
 
-proc getCurrentLine(L: BaseLexer, marker: bool = true): string =
+proc getCurrentLine*(L: BaseLexer, marker: bool = true): string =
+  ## retrieves the current line.
   var i: int
   result = ""
   i = L.lineStart
@@ -166,4 +161,3 @@ proc getCurrentLine(L: BaseLexer, marker: bool = true): string =
   add(result, "\n")
   if marker:
     add(result, spaces(getColNumber(L, L.bufpos)) & "^\n")
-
diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim
index eb792f086..e1abb0a4f 100644
--- a/lib/pure/parsexml.nim
+++ b/lib/pure/parsexml.nim
@@ -8,19 +8,19 @@
 #
 
 ## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
-## parser. 
+## parser.
 ## The only encoding that is supported is UTF-8. The parser has been designed
-## to be somewhat error correcting, so that even most "wild HTML" found on the 
+## to be somewhat error correcting, so that even most "wild HTML" found on the
 ## web can be parsed with it. **Note:** This parser does not check that each
-## ``<tag>`` has a corresponding ``</tag>``! These checks have do be 
-## implemented by the client code for various reasons: 
+## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
+## implemented by the client code for various reasons:
 ##
 ## * Old HTML contains tags that have no end tag: ``<br>`` for example.
 ## * HTML tags are case insensitive, XML tags are case sensitive. Since this
 ##   library can parse both, only the client knows which comparison is to be
 ##   used.
 ## * Thus the checks would have been very difficult to implement properly with
-##   little benefit, especially since they are simple to implement in the 
+##   little benefit, especially since they are simple to implement in the
 ##   client. The client should use the `errorMsgExpected` proc to generate
 ##   a nice error message that fits the other error messages this library
 ##   creates.
@@ -29,7 +29,7 @@
 ## Example 1: Retrieve HTML title
 ## ==============================
 ##
-## The file ``examples/htmltitle.nim`` demonstrates how to use the 
+## The file ``examples/htmltitle.nim`` demonstrates how to use the
 ## XML parser to accomplish a simple task: To determine the title of an HTML
 ## document.
 ##
@@ -40,22 +40,22 @@
 ## Example 2: Retrieve all HTML links
 ## ==================================
 ##
-## The file ``examples/htmlrefs.nim`` demonstrates how to use the 
-## XML parser to accomplish another simple task: To determine all the links 
+## The file ``examples/htmlrefs.nim`` demonstrates how to use the
+## XML parser to accomplish another simple task: To determine all the links
 ## an HTML document contains.
 ##
 ## .. code-block:: nim
 ##     :file: examples/htmlrefs.nim
 ##
 
-import 
+import
   hashes, strutils, lexbase, streams, unicode
 
 # the parser treats ``<br />`` as ``<br></br>``
 
-#  xmlElementCloseEnd, ## ``/>`` 
+#  xmlElementCloseEnd, ## ``/>``
 
-type 
+type
   XmlEventKind* = enum ## enumation of all events that may occur when parsing
     xmlError,           ## an error occurred during parsing
     xmlEof,             ## end of file reached
@@ -65,13 +65,13 @@ type
     xmlPI,              ## processing instruction (``<?name something ?>``)
     xmlElementStart,    ## ``<elem>``
     xmlElementEnd,      ## ``</elem>``
-    xmlElementOpen,     ## ``<elem 
+    xmlElementOpen,     ## ``<elem
     xmlAttribute,       ## ``key = "value"`` pair
-    xmlElementClose,    ## ``>`` 
+    xmlElementClose,    ## ``>``
     xmlCData,           ## ``<![CDATA[`` ... data ... ``]]>``
     xmlEntity,          ## &entity;
     xmlSpecial          ## ``<! ... data ... >``
-    
+
   XmlErrorKind* = enum       ## enumeration that lists all errors that can occur
     errNone,                 ## no error
     errEndOfCDataExpected,   ## ``]]>`` expected
@@ -82,8 +82,8 @@ type
     errEqExpected,           ## ``=`` expected
     errQuoteExpected,        ## ``"`` or ``'`` expected
     errEndOfCommentExpected  ## ``-->`` expected
-    
-  ParserState = enum 
+
+  ParserState = enum
     stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
 
   XmlParseOption* = enum  ## options for the XML parser
@@ -121,8 +121,8 @@ proc open*(my: var XmlParser, input: Stream, filename: string,
   ## the `options` parameter: If `options` contains ``reportWhitespace``
   ## a whitespace token is reported as an ``xmlWhitespace`` event.
   ## If `options` contains ``reportComments`` a comment token is reported as an
-  ## ``xmlComment`` event. 
-  lexbase.open(my, input)
+  ## ``xmlComment`` event.
+  lexbase.open(my, input, 8192, {'\c', '\L', '/'})
   my.filename = filename
   my.state = stateStart
   my.kind = xmlError
@@ -130,24 +130,24 @@ proc open*(my: var XmlParser, input: Stream, filename: string,
   my.b = ""
   my.c = nil
   my.options = options
-  
-proc close*(my: var XmlParser) {.inline.} = 
+
+proc close*(my: var XmlParser) {.inline.} =
   ## closes the parser `my` and its associated input stream.
   lexbase.close(my)
 
-proc kind*(my: XmlParser): XmlEventKind {.inline.} = 
+proc kind*(my: XmlParser): XmlEventKind {.inline.} =
   ## returns the current event type for the XML parser
   return my.kind
 
 template charData*(my: XmlParser): string =
-  ## returns the character data for the events: ``xmlCharData``, 
+  ## returns the character data for the events: ``xmlCharData``,
   ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
-  assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData, 
+  assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
                      xmlSpecial})
   my.a
 
 template elementName*(my: XmlParser): string =
-  ## returns the element name for the events: ``xmlElementStart``, 
+  ## returns the element name for the events: ``xmlElementStart``,
   ## ``xmlElementEnd``, ``xmlElementOpen``
   assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
   my.a
@@ -156,12 +156,12 @@ template entityName*(my: XmlParser): string =
   ## returns the entity name for the event: ``xmlEntity``
   assert(my.kind == xmlEntity)
   my.a
-  
+
 template attrKey*(my: XmlParser): string =
   ## returns the attribute key for the event ``xmlAttribute``
   assert(my.kind == xmlAttribute)
   my.a
-  
+
 template attrValue*(my: XmlParser): string =
   ## returns the attribute value for the event ``xmlAttribute``
   assert(my.kind == xmlAttribute)
@@ -187,110 +187,118 @@ proc rawData2*(my: XmlParser): string {.inline.} =
   ## This is only used for speed hacks.
   shallowCopy(result, my.b)
 
-proc getColumn*(my: XmlParser): int {.inline.} = 
+proc getColumn*(my: XmlParser): int {.inline.} =
   ## get the current column the parser has arrived at.
   result = getColNumber(my, my.bufpos)
 
-proc getLine*(my: XmlParser): int {.inline.} = 
+proc getLine*(my: XmlParser): int {.inline.} =
   ## get the current line the parser has arrived at.
   result = my.lineNumber
 
-proc getFilename*(my: XmlParser): string {.inline.} = 
+proc getFilename*(my: XmlParser): string {.inline.} =
   ## get the filename of the file that the parser processes.
   result = my.filename
-  
-proc errorMsg*(my: XmlParser): string = 
+
+proc errorMsg*(my: XmlParser): string =
   ## returns a helpful error message for the event ``xmlError``
   assert(my.kind == xmlError)
   result = "$1($2, $3) Error: $4" % [
     my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]
 
-proc errorMsgExpected*(my: XmlParser, tag: string): string = 
+proc errorMsgExpected*(my: XmlParser, tag: string): string =
   ## returns an error message "<tag> expected" in the same format as the
-  ## other error messages 
+  ## other error messages
   result = "$1($2, $3) Error: $4" % [
     my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]
 
-proc errorMsg*(my: XmlParser, msg: string): string = 
+proc errorMsg*(my: XmlParser, msg: string): string =
   ## returns an error message with text `msg` in the same format as the
-  ## other error messages 
+  ## other error messages
   result = "$1($2, $3) Error: $4" % [
     my.filename, $getLine(my), $getColumn(my), msg]
-    
-proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} = 
+
+proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
   my.err = kind
   my.state = stateError
 
-proc parseCDATA(my: var XmlParser) = 
+proc parseCDATA(my: var XmlParser) =
   var pos = my.bufpos + len("<![CDATA[")
   var buf = my.buf
   while true:
-    case buf[pos] 
+    case buf[pos]
     of ']':
       if buf[pos+1] == ']' and buf[pos+2] == '>':
         inc(pos, 3)
         break
       add(my.a, ']')
       inc(pos)
-    of '\0': 
+    of '\0':
       markError(my, errEndOfCDataExpected)
       break
-    of '\c': 
+    of '\c':
       pos = lexbase.handleCR(my, pos)
       buf = my.buf
       add(my.a, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
       buf = my.buf
       add(my.a, '\L')
+    of '/':
+      pos = lexbase.handleRefillChar(my, pos)
+      buf = my.buf
+      add(my.a, '/')
     else:
       add(my.a, buf[pos])
-      inc(pos)    
+      inc(pos)
   my.bufpos = pos # store back
   my.kind = xmlCData
 
-proc parseComment(my: var XmlParser) = 
+proc parseComment(my: var XmlParser) =
   var pos = my.bufpos + len("<!--")
   var buf = my.buf
   while true:
-    case buf[pos] 
+    case buf[pos]
     of '-':
       if buf[pos+1] == '-' and buf[pos+2] == '>':
         inc(pos, 3)
         break
       if my.options.contains(reportComments): add(my.a, '-')
       inc(pos)
-    of '\0': 
+    of '\0':
       markError(my, errEndOfCommentExpected)
       break
-    of '\c': 
+    of '\c':
       pos = lexbase.handleCR(my, pos)
       buf = my.buf
       if my.options.contains(reportComments): add(my.a, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
       buf = my.buf
       if my.options.contains(reportComments): add(my.a, '\L')
+    of '/':
+      pos = lexbase.handleRefillChar(my, pos)
+      buf = my.buf
+      if my.options.contains(reportComments): add(my.a, '/')
     else:
       if my.options.contains(reportComments): add(my.a, buf[pos])
       inc(pos)
   my.bufpos = pos
   my.kind = xmlComment
 
-proc parseWhitespace(my: var XmlParser, skip=false) = 
+proc parseWhitespace(my: var XmlParser, skip=false) =
   var pos = my.bufpos
   var buf = my.buf
-  while true: 
+  while true:
     case buf[pos]
-    of ' ', '\t': 
+    of ' ', '\t':
       if not skip: add(my.a, buf[pos])
       inc(pos)
-    of '\c':  
+    of '\c':
       # the specification says that CR-LF, CR are to be transformed to LF
       pos = lexbase.handleCR(my, pos)
       buf = my.buf
       if not skip: add(my.a, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
       buf = my.buf
       if not skip: add(my.a, '\L')
@@ -302,10 +310,10 @@ const
   NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
   NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'}
 
-proc parseName(my: var XmlParser, dest: var string) = 
+proc parseName(my: var XmlParser, dest: var string) =
   var pos = my.bufpos
   var buf = my.buf
-  if buf[pos] in NameStartChar: 
+  if buf[pos] in NameStartChar:
     while true:
       add(dest, buf[pos])
       inc(pos)
@@ -314,14 +322,14 @@ proc parseName(my: var XmlParser, dest: var string) =
   else:
     markError(my, errNameExpected)
 
-proc parseEntity(my: var XmlParser, dest: var string) = 
+proc parseEntity(my: var XmlParser, dest: var string) =
   var pos = my.bufpos+1
   var buf = my.buf
   my.kind = xmlCharData
   if buf[pos] == '#':
     var r: int
     inc(pos)
-    if buf[pos] == 'x': 
+    if buf[pos] == 'x':
       inc(pos)
       while true:
         case buf[pos]
@@ -331,7 +339,7 @@ proc parseEntity(my: var XmlParser, dest: var string) =
         else: break
         inc(pos)
     else:
-      while buf[pos] in {'0'..'9'}: 
+      while buf[pos] in {'0'..'9'}:
         r = r * 10 + (ord(buf[pos]) - ord('0'))
         inc(pos)
     add(dest, toUTF8(Rune(r)))
@@ -345,11 +353,11 @@ proc parseEntity(my: var XmlParser, dest: var string) =
       buf[pos+3] == ';':
     add(dest, '&')
     inc(pos, 3)
-  elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and 
+  elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and
       buf[pos+3] == 's' and buf[pos+4] == ';':
     add(dest, '\'')
     inc(pos, 4)
-  elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and 
+  elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and
       buf[pos+3] == 't' and buf[pos+4] == ';':
     add(dest, '"')
     inc(pos, 4)
@@ -357,23 +365,23 @@ proc parseEntity(my: var XmlParser, dest: var string) =
     my.bufpos = pos
     parseName(my, dest)
     pos = my.bufpos
-    if my.err != errNameExpected: 
+    if my.err != errNameExpected:
       my.kind = xmlEntity
     else:
       add(dest, '&')
-  if buf[pos] == ';': 
+  if buf[pos] == ';':
     inc(pos)
   else:
     markError(my, errSemicolonExpected)
   my.bufpos = pos
 
-proc parsePI(my: var XmlParser) = 
+proc parsePI(my: var XmlParser) =
   inc(my.bufpos, "<?".len)
   parseName(my, my.a)
   var pos = my.bufpos
   var buf = my.buf
   setLen(my.b, 0)
-  while true: 
+  while true:
     case buf[pos]
     of '\0':
       markError(my, errQmGtExpected)
@@ -387,29 +395,33 @@ proc parsePI(my: var XmlParser) =
     of '\c':
       # the specification says that CR-LF, CR are to be transformed to LF
       pos = lexbase.handleCR(my, pos)
-      buf = my.buf      
+      buf = my.buf
       add(my.b, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
       buf = my.buf
       add(my.b, '\L')
+    of '/':
+      pos = lexbase.handleRefillChar(my, pos)
+      buf = my.buf
+      add(my.b, '/')
     else:
       add(my.b, buf[pos])
       inc(pos)
   my.bufpos = pos
   my.kind = xmlPI
 
-proc parseSpecial(my: var XmlParser) = 
+proc parseSpecial(my: var XmlParser) =
   # things that start with <!
   var pos = my.bufpos + 2
   var buf = my.buf
   var opentags = 0
-  while true: 
+  while true:
     case buf[pos]
     of '\0':
       markError(my, errGtExpected)
       break
-    of '<': 
+    of '<':
       inc(opentags)
       inc(pos)
       add(my.a, '<')
@@ -420,47 +432,55 @@ proc parseSpecial(my: var XmlParser) =
       dec(opentags)
       inc(pos)
       add(my.a, '>')
-    of '\c':  
+    of '\c':
       pos = lexbase.handleCR(my, pos)
       buf = my.buf
       add(my.a, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
       buf = my.buf
       add(my.a, '\L')
+    of '/':
+      pos = lexbase.handleRefillChar(my, pos)
+      buf = my.buf
+      add(my.a, '/')  # `<! ... >` data is collected in my.a (what charData returns); my.b would drop the '/'
     else:
       add(my.a, buf[pos])
       inc(pos)
   my.bufpos = pos
   my.kind = xmlSpecial
 
-proc parseTag(my: var XmlParser) = 
+proc parseTag(my: var XmlParser) =
   inc(my.bufpos)
   parseName(my, my.a)
   # if we have no name, do not interpret the '<':
-  if my.a.len == 0: 
+  if my.a.len == 0:
     my.kind = xmlCharData
     add(my.a, '<')
     return
   parseWhitespace(my, skip=true)
-  if my.buf[my.bufpos] in NameStartChar: 
+  if my.buf[my.bufpos] in NameStartChar:
     # an attribute follows:
     my.kind = xmlElementOpen
     my.state = stateAttr
     my.c = my.a # save for later
   else:
     my.kind = xmlElementStart
-    if my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>':
-      inc(my.bufpos, 2)
+    let slash = my.buf[my.bufpos] == '/'
+    if slash:
+      my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
+    if slash and my.buf[my.bufpos] == '>':
+      inc(my.bufpos)
       my.state = stateEmptyElementTag
       my.c = nil
     elif my.buf[my.bufpos] == '>':
-      inc(my.bufpos)  
+      inc(my.bufpos)
     else:
       markError(my, errGtExpected)
-  
-proc parseEndTag(my: var XmlParser) = 
-  inc(my.bufpos, 2)
+
+proc parseEndTag(my: var XmlParser) =
+  my.bufpos = lexbase.handleRefillChar(my, my.bufpos+1)
+  # (was: inc(my.bufpos, 2)) skips "</"; '/' is a refill char, so it must go through handleRefillChar
   parseName(my, my.a)
   parseWhitespace(my, skip=true)
   if my.buf[my.bufpos] == '>':
@@ -469,13 +489,13 @@ proc parseEndTag(my: var XmlParser) =
     markError(my, errGtExpected)
   my.kind = xmlElementEnd
 
-proc parseAttribute(my: var XmlParser) = 
+proc parseAttribute(my: var XmlParser) =
   my.kind = xmlAttribute
   setLen(my.a, 0)
   setLen(my.b, 0)
   parseName(my, my.a)
   # if we have no name, we have '<tag attr= key %&$$%':
-  if my.a.len == 0: 
+  if my.a.len == 0:
     markError(my, errGtExpected)
     return
   parseWhitespace(my, skip=true)
@@ -491,27 +511,27 @@ proc parseAttribute(my: var XmlParser) =
     var quote = buf[pos]
     var pendingSpace = false
     inc(pos)
-    while true: 
+    while true:
       case buf[pos]
       of '\0':
         markError(my, errQuoteExpected)
         break
-      of '&': 
-        if pendingSpace: 
+      of '&':
+        if pendingSpace:
           add(my.b, ' ')
           pendingSpace = false
         my.bufpos = pos
         parseEntity(my, my.b)
         my.kind = xmlAttribute # parseEntity overwrites my.kind!
         pos = my.bufpos
-      of ' ', '\t': 
+      of ' ', '\t':
         pendingSpace = true
         inc(pos)
-      of '\c':  
+      of '\c':
         pos = lexbase.handleCR(my, pos)
         buf = my.buf
         pendingSpace = true
-      of '\L': 
+      of '\L':
         pos = lexbase.handleLF(my, pos)
         buf = my.buf
         pendingSpace = true
@@ -520,44 +540,48 @@ proc parseAttribute(my: var XmlParser) =
           inc(pos)
           break
         else:
-          if pendingSpace: 
+          if pendingSpace:
             add(my.b, ' ')
             pendingSpace = false
           add(my.b, buf[pos])
           inc(pos)
   else:
-    markError(my, errQuoteExpected)  
+    markError(my, errQuoteExpected)
   my.bufpos = pos
   parseWhitespace(my, skip=true)
-  
-proc parseCharData(my: var XmlParser) = 
+
+proc parseCharData(my: var XmlParser) =
   var pos = my.bufpos
   var buf = my.buf
-  while true: 
+  while true:
     case buf[pos]
     of '\0', '<', '&': break
-    of '\c':  
+    of '\c':
       # the specification says that CR-LF, CR are to be transformed to LF
       pos = lexbase.handleCR(my, pos)
       buf = my.buf
       add(my.a, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
       buf = my.buf
       add(my.a, '\L')
+    of '/':
+      pos = lexbase.handleRefillChar(my, pos)
+      buf = my.buf
+      add(my.a, '/')
     else:
       add(my.a, buf[pos])
       inc(pos)
   my.bufpos = pos
   my.kind = xmlCharData
 
-proc rawGetTok(my: var XmlParser) = 
+proc rawGetTok(my: var XmlParser) =
   my.kind = xmlError
   setLen(my.a, 0)
   var pos = my.bufpos
   var buf = my.buf
   case buf[pos]
-  of '<': 
+  of '<':
     case buf[pos+1]
     of '/':
       parseEndTag(my)
@@ -566,44 +590,44 @@ proc rawGetTok(my: var XmlParser) =
           buf[pos+5] == 'A' and buf[pos+6] == 'T' and buf[pos+7] == 'A' and
           buf[pos+8] == '[':
         parseCDATA(my)
-      elif buf[pos+2] == '-' and buf[pos+3] == '-': 
+      elif buf[pos+2] == '-' and buf[pos+3] == '-':
         parseComment(my)
-      else: 
+      else:
         parseSpecial(my)
     of '?':
       parsePI(my)
-    else: 
+    else:
       parseTag(my)
-  of ' ', '\t', '\c', '\l': 
+  of ' ', '\t', '\c', '\l':
     parseWhitespace(my)
     my.kind = xmlWhitespace
-  of '\0': 
+  of '\0':
     my.kind = xmlEof
   of '&':
     parseEntity(my, my.a)
-  else: 
+  else:
     parseCharData(my)
   assert my.kind != xmlError
-    
-proc getTok(my: var XmlParser) = 
+
+proc getTok(my: var XmlParser) =
   while true:
     rawGetTok(my)
     case my.kind
-    of xmlComment: 
+    of xmlComment:
       if my.options.contains(reportComments): break
-    of xmlWhitespace: 
+    of xmlWhitespace:
       if my.options.contains(reportWhitespace): break
     else: break
-    
-proc next*(my: var XmlParser) = 
+
+proc next*(my: var XmlParser) =
   ## retrieves the first/next event. This controls the parser.
   case my.state
   of stateNormal:
-    getTok(my)  
+    getTok(my)
   of stateStart:
     my.state = stateNormal
     getTok(my)
-    if my.kind == xmlPI and my.a == "xml": 
+    if my.kind == xmlPI and my.a == "xml":
       # just skip the first ``<?xml >`` processing instruction
       getTok(my)
   of stateAttr:
@@ -612,10 +636,14 @@ proc next*(my: var XmlParser) =
       my.kind = xmlElementClose
       inc(my.bufpos)
       my.state = stateNormal
-    elif my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>': 
-      my.kind = xmlElementClose
-      inc(my.bufpos, 2)
-      my.state = stateEmptyElementTag
+    elif my.buf[my.bufpos] == '/':
+      my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
+      if my.buf[my.bufpos] == '>':
+        my.kind = xmlElementClose
+        inc(my.bufpos)
+        my.state = stateEmptyElementTag
+      else:
+        markError(my, errGtExpected)
     else:
       parseAttribute(my)
       # state remains the same
@@ -624,10 +652,10 @@ proc next*(my: var XmlParser) =
     my.kind = xmlElementEnd
     if not my.c.isNil:
       my.a = my.c
-  of stateError: 
+  of stateError:
     my.kind = xmlError
     my.state = stateNormal
-  
+
 when not defined(testing) and isMainModule:
   import os
   var s = newFileStream(paramStr(1), fmRead)
@@ -645,13 +673,13 @@ when not defined(testing) and isMainModule:
     of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
     of xmlElementStart: echo("<$1>" % x.elementName)
     of xmlElementEnd: echo("</$1>" % x.elementName)
-    
-    of xmlElementOpen: echo("<$1" % x.elementName) 
-    of xmlAttribute:   
+
+    of xmlElementOpen: echo("<$1" % x.elementName)
+    of xmlAttribute:
       echo("Key: " & x.attrKey)
       echo("Value: " & x.attrValue)
-    
-    of xmlElementClose: echo(">") 
+
+    of xmlElementClose: echo(">")
     of xmlCData:
       echo("<![CDATA[$1]]>" % x.charData)
     of xmlEntity:
author	Araq <rumpf_a@web.de>	2015-07-01 15:47:15 +0200
committer	Araq <rumpf_a@web.de>	2015-07-01 15:47:15 +0200
commit	0d7e0e1b4fb9e99259eb9f2a1ad42a7c0136e48b (patch)
tree	f5dda64458a2a7f3f2b438a530aea2aedb74e773
parent	13259c669dc1dcdd72c88aedfe8689fd333288d3 (diff)
download	Nim-0d7e0e1b4fb9e99259eb9f2a1ad42a7c0136e48b.tar.gz