summary refs log tree commit diff stats
path: root/lib/pure/parsexml.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pure/parsexml.nim')
-rw-r--r--lib/pure/parsexml.nim664
1 files changed, 413 insertions, 251 deletions
diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim
index 2663c5b2f..c760799a2 100644
--- a/lib/pure/parsexml.nim
+++ b/lib/pure/parsexml.nim
@@ -8,99 +8,201 @@
 #
 
 ## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
-## parser. 
+## parser.
 ## The only encoding that is supported is UTF-8. The parser has been designed
-## to be somewhat error correcting, so that even most "wild HTML" found on the 
+## to be somewhat error correcting, so that even most "wild HTML" found on the
 ## web can be parsed with it. **Note:** This parser does not check that each
-## ``<tag>`` has a corresponding ``</tag>``! These checks have do be 
-## implemented by the client code for various reasons: 
+## ``<tag>`` has a corresponding ``</tag>``! These checks have do be
+## implemented by the client code for various reasons:
 ##
 ## * Old HTML contains tags that have no end tag: ``<br>`` for example.
 ## * HTML tags are case insensitive, XML tags are case sensitive. Since this
 ##   library can parse both, only the client knows which comparison is to be
 ##   used.
 ## * Thus the checks would have been very difficult to implement properly with
-##   little benefit, especially since they are simple to implement in the 
+##   little benefit, especially since they are simple to implement in the
 ##   client. The client should use the `errorMsgExpected` proc to generate
 ##   a nice error message that fits the other error messages this library
 ##   creates.
 ##
 ##
-## Example 1: Retrieve HTML title
-## ==============================
-##
-## The file ``examples/htmltitle.nim`` demonstrates how to use the 
-## XML parser to accomplish a simple task: To determine the title of an HTML
-## document.
-##
-## .. code-block:: nim
-##     :file: examples/htmltitle.nim
-##
-##
-## Example 2: Retrieve all HTML links
-## ==================================
-##
-## The file ``examples/htmlrefs.nim`` demonstrates how to use the 
-## XML parser to accomplish another simple task: To determine all the links 
-## an HTML document contains.
-##
-## .. code-block:: nim
-##     :file: examples/htmlrefs.nim
-##
 
-import 
-  hashes, strutils, lexbase, streams, unicode
+##[
+
+Example 1: Retrieve HTML title
+==============================
+
+The file ``examples/htmltitle.nim`` demonstrates how to use the
+XML parser to accomplish a simple task: To determine the title of an HTML
+document.
+
+  ```nim
+  # Example program to show the parsexml module
+  # This program reads an HTML file and writes its title to stdout.
+  # Errors and whitespace are ignored.
+
+  import std/[os, streams, parsexml, strutils]
+
+  if paramCount() < 1:
+    quit("Usage: htmltitle filename[.html]")
+
+  var filename = addFileExt(paramStr(1), "html")
+  var s = newFileStream(filename, fmRead)
+  if s == nil: quit("cannot open the file " & filename)
+  var x: XmlParser
+  open(x, s, filename)
+  while true:
+    x.next()
+    case x.kind
+    of xmlElementStart:
+      if cmpIgnoreCase(x.elementName, "title") == 0:
+        var title = ""
+        x.next()  # skip "<title>"
+        while x.kind == xmlCharData:
+          title.add(x.charData)
+          x.next()
+        if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
+          echo("Title: " & title)
+          quit(0) # Success!
+        else:
+          echo(x.errorMsgExpected("/title"))
+
+    of xmlEof: break # end of file reached
+    else: discard # ignore other events
+
+  x.close()
+  quit("Could not determine title!")
+  ```
+
+]##
+
+##[
+
+Example 2: Retrieve all HTML links
+==================================
+
+The file ``examples/htmlrefs.nim`` demonstrates how to use the
+XML parser to accomplish another simple task: To determine all the links
+an HTML document contains.
+
+  ```nim
+  # Example program to show the new parsexml module
+  # This program reads an HTML file and writes all its used links to stdout.
+  # Errors and whitespace are ignored.
+
+  import std/[os, streams, parsexml, strutils]
+
+  proc `=?=` (a, b: string): bool =
+    # little trick: define our own comparator that ignores case
+    return cmpIgnoreCase(a, b) == 0
+
+  if paramCount() < 1:
+    quit("Usage: htmlrefs filename[.html]")
+
+  var links = 0 # count the number of links
+  var filename = addFileExt(paramStr(1), "html")
+  var s = newFileStream(filename, fmRead)
+  if s == nil: quit("cannot open the file " & filename)
+  var x: XmlParser
+  open(x, s, filename)
+  next(x) # get first event
+  block mainLoop:
+    while true:
+      case x.kind
+      of xmlElementOpen:
+        # the <a href = "xyz"> tag we are interested in always has an attribute,
+        # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
+        if x.elementName =?= "a":
+          x.next()
+          if x.kind == xmlAttribute:
+            if x.attrKey =?= "href":
+              var link = x.attrValue
+              inc(links)
+              # skip until we have an ``xmlElementClose`` event
+              while true:
+                x.next()
+                case x.kind
+                of xmlEof: break mainLoop
+                of xmlElementClose: break
+                else: discard
+              x.next() # skip ``xmlElementClose``
+              # now we have the description for the ``a`` element
+              var desc = ""
+              while x.kind == xmlCharData:
+                desc.add(x.charData)
+                x.next()
+              echo(desc & ": " & link)
+        else:
+          x.next()
+      of xmlEof: break # end of file reached
+      of xmlError:
+        echo(errorMsg(x))
+        x.next()
+      else: x.next() # skip other events
+
+  echo($links & " link(s) found!")
+  x.close()
+  ```
+
+]##
+
+import
+  std/[strutils, lexbase, streams, unicode]
+
+when defined(nimPreviewSlimSystem):
+  import std/[assertions, syncio]
 
 # the parser treats ``<br />`` as ``<br></br>``
 
-#  xmlElementCloseEnd, ## ``/>`` 
-
-type 
-  XmlEventKind* = enum ## enumation of all events that may occur when parsing
-    xmlError,           ## an error occurred during parsing
-    xmlEof,             ## end of file reached
-    xmlCharData,        ## character data
-    xmlWhitespace,      ## whitespace has been parsed
-    xmlComment,         ## a comment has been parsed
-    xmlPI,              ## processing instruction (``<?name something ?>``)
-    xmlElementStart,    ## ``<elem>``
-    xmlElementEnd,      ## ``</elem>``
-    xmlElementOpen,     ## ``<elem 
-    xmlAttribute,       ## ``key = "value"`` pair
-    xmlElementClose,    ## ``>`` 
-    xmlCData,           ## ``<![CDATA[`` ... data ... ``]]>``
-    xmlEntity,          ## &entity;
-    xmlSpecial          ## ``<! ... data ... >``
-    
-  XmlErrorKind* = enum       ## enumeration that lists all errors that can occur
-    errNone,                 ## no error
-    errEndOfCDataExpected,   ## ``]]>`` expected
-    errNameExpected,         ## name expected
-    errSemicolonExpected,    ## ``;`` expected
-    errQmGtExpected,         ## ``?>`` expected
-    errGtExpected,           ## ``>`` expected
-    errEqExpected,           ## ``=`` expected
-    errQuoteExpected,        ## ``"`` or ``'`` expected
-    errEndOfCommentExpected  ## ``-->`` expected
-    
-  ParserState = enum 
+#  xmlElementCloseEnd, ## ``/>``
+
+type
+  XmlEventKind* = enum ## enumeration of all events that may occur when parsing
+    xmlError,          ## an error occurred during parsing
+    xmlEof,            ## end of file reached
+    xmlCharData,       ## character data
+    xmlWhitespace,     ## whitespace has been parsed
+    xmlComment,        ## a comment has been parsed
+    xmlPI,             ## processing instruction (``<?name something ?>``)
+    xmlElementStart,   ## ``<elem>``
+    xmlElementEnd,     ## ``</elem>``
+    xmlElementOpen,    ## ``<elem
+    xmlAttribute,      ## ``key = "value"`` pair
+    xmlElementClose,   ## ``>``
+    xmlCData,          ## ``<![CDATA[`` ... data ... ``]]>``
+    xmlEntity,         ## &entity;
+    xmlSpecial         ## ``<! ... data ... >``
+
+  XmlErrorKind* = enum        ## enumeration that lists all errors that can occur
+    errNone,                  ## no error
+    errEndOfCDataExpected,    ## ``]]>`` expected
+    errNameExpected,          ## name expected
+    errSemicolonExpected,     ## ``;`` expected
+    errQmGtExpected,          ## ``?>`` expected
+    errGtExpected,            ## ``>`` expected
+    errEqExpected,            ## ``=`` expected
+    errQuoteExpected,         ## ``"`` or ``'`` expected
+    errEndOfCommentExpected   ## ``-->`` expected
+    errAttributeValueExpected ## non-empty attribute value expected
+
+  ParserState = enum
     stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
 
-  XmlParseOption* = enum  ## options for the XML parser
-    reportWhitespace,      ## report whitespace
-    reportComments         ## report comments
+  XmlParseOption* = enum ## options for the XML parser
+    reportWhitespace,    ## report whitespace
+    reportComments       ## report comments
+    allowUnquotedAttribs ## allow unquoted attribute values (for HTML)
+    allowEmptyAttribs    ## allow empty attributes (without explicit value)
 
   XmlParser* = object of BaseLexer ## the parser object.
     a, b, c: string
     kind: XmlEventKind
     err: XmlErrorKind
     state: ParserState
+    cIsEmpty: bool
     filename: string
     options: set[XmlParseOption]
 
-{.deprecated: [TXmlParser: XmlParser, TXmlParseOptions: XmlParseOption,
-    TXmlError: XmlErrorKind, TXmlEventKind: XmlEventKind].}
-
 const
   errorMessages: array[XmlErrorKind, string] = [
     "no error",
@@ -111,7 +213,8 @@ const
     "'>' expected",
     "'=' expected",
     "'\"' or \"'\" expected",
-    "'-->' expected"
+    "'-->' expected",
+    "attribute value expected"
   ]
 
 proc open*(my: var XmlParser, input: Stream, filename: string,
@@ -121,178 +224,197 @@ proc open*(my: var XmlParser, input: Stream, filename: string,
   ## the `options` parameter: If `options` contains ``reportWhitespace``
   ## a whitespace token is reported as an ``xmlWhitespace`` event.
   ## If `options` contains ``reportComments`` a comment token is reported as an
-  ## ``xmlComment`` event. 
-  lexbase.open(my, input)
+  ## ``xmlComment`` event.
+  lexbase.open(my, input, 8192, {'\c', '\L', '/'})
   my.filename = filename
   my.state = stateStart
   my.kind = xmlError
   my.a = ""
   my.b = ""
-  my.c = nil
+  my.c = ""
+  my.cIsEmpty = true
   my.options = options
-  
-proc close*(my: var XmlParser) {.inline.} = 
+
+proc close*(my: var XmlParser) {.inline.} =
   ## closes the parser `my` and its associated input stream.
   lexbase.close(my)
 
-proc kind*(my: XmlParser): XmlEventKind {.inline.} = 
+proc kind*(my: XmlParser): XmlEventKind {.inline.} =
   ## returns the current event type for the XML parser
   return my.kind
 
 template charData*(my: XmlParser): string =
-  ## returns the character data for the events: ``xmlCharData``, 
+  ## returns the character data for the events: ``xmlCharData``,
   ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
-  assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData, 
+  ## Raises an assertion in debug mode if ``my.kind`` is not one
+  ## of those events. In release mode, this will not trigger an error
+  ## but the value returned will not be valid.
+  assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
                      xmlSpecial})
   my.a
 
 template elementName*(my: XmlParser): string =
-  ## returns the element name for the events: ``xmlElementStart``, 
+  ## returns the element name for the events: ``xmlElementStart``,
   ## ``xmlElementEnd``, ``xmlElementOpen``
+  ## Raises an assertion in debug mode if ``my.kind`` is not one
+  ## of those events. In release mode, this will not trigger an error
+  ## but the value returned will not be valid.
   assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
   my.a
 
 template entityName*(my: XmlParser): string =
   ## returns the entity name for the event: ``xmlEntity``
+  ## Raises an assertion in debug mode if ``my.kind`` is not
+  ## ``xmlEntity``. In release mode, this will not trigger an error
+  ## but the value returned will not be valid.
   assert(my.kind == xmlEntity)
   my.a
-  
+
 template attrKey*(my: XmlParser): string =
   ## returns the attribute key for the event ``xmlAttribute``
+  ## Raises an assertion in debug mode if ``my.kind`` is not
+  ## ``xmlAttribute``. In release mode, this will not trigger an error
+  ## but the value returned will not be valid.
   assert(my.kind == xmlAttribute)
   my.a
-  
+
 template attrValue*(my: XmlParser): string =
   ## returns the attribute value for the event ``xmlAttribute``
+  ## Raises an assertion in debug mode if ``my.kind`` is not
+  ## ``xmlAttribute``. In release mode, this will not trigger an error
+  ## but the value returned will not be valid.
   assert(my.kind == xmlAttribute)
   my.b
 
 template piName*(my: XmlParser): string =
   ## returns the processing instruction name for the event ``xmlPI``
+  ## Raises an assertion in debug mode if ``my.kind`` is not
+  ## ``xmlPI``. In release mode, this will not trigger an error
+  ## but the value returned will not be valid.
   assert(my.kind == xmlPI)
   my.a
 
 template piRest*(my: XmlParser): string =
   ## returns the rest of the processing instruction for the event ``xmlPI``
+  ## Raises an assertion in debug mode if ``my.kind`` is not
+  ## ``xmlPI``. In release mode, this will not trigger an error
+  ## but the value returned will not be valid.
   assert(my.kind == xmlPI)
   my.b
 
-proc rawData*(my: XmlParser): string {.inline.} =
+proc rawData*(my: var XmlParser): lent string {.inline.} =
   ## returns the underlying 'data' string by reference.
   ## This is only used for speed hacks.
-  shallowCopy(result, my.a)
+  result = my.a
 
-proc rawData2*(my: XmlParser): string {.inline.} =
+proc rawData2*(my: var XmlParser): lent string {.inline.} =
   ## returns the underlying second 'data' string by reference.
   ## This is only used for speed hacks.
-  shallowCopy(result, my.b)
+  result = my.b
 
-proc getColumn*(my: XmlParser): int {.inline.} = 
+proc getColumn*(my: XmlParser): int {.inline.} =
   ## get the current column the parser has arrived at.
   result = getColNumber(my, my.bufpos)
 
-proc getLine*(my: XmlParser): int {.inline.} = 
+proc getLine*(my: XmlParser): int {.inline.} =
   ## get the current line the parser has arrived at.
   result = my.lineNumber
 
-proc getFilename*(my: XmlParser): string {.inline.} = 
+proc getFilename*(my: XmlParser): string {.inline.} =
   ## get the filename of the file that the parser processes.
   result = my.filename
-  
-proc errorMsg*(my: XmlParser): string = 
+
+proc errorMsg*(my: XmlParser): string =
   ## returns a helpful error message for the event ``xmlError``
   assert(my.kind == xmlError)
   result = "$1($2, $3) Error: $4" % [
     my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]
 
-proc errorMsgExpected*(my: XmlParser, tag: string): string = 
+proc errorMsgExpected*(my: XmlParser, tag: string): string =
   ## returns an error message "<tag> expected" in the same format as the
-  ## other error messages 
+  ## other error messages
   result = "$1($2, $3) Error: $4" % [
     my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]
 
-proc errorMsg*(my: XmlParser, msg: string): string = 
+proc errorMsg*(my: XmlParser, msg: string): string =
   ## returns an error message with text `msg` in the same format as the
-  ## other error messages 
+  ## other error messages
   result = "$1($2, $3) Error: $4" % [
     my.filename, $getLine(my), $getColumn(my), msg]
-    
-proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} = 
+
+proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
   my.err = kind
   my.state = stateError
 
-proc parseCDATA(my: var XmlParser) = 
+proc parseCDATA(my: var XmlParser) =
   var pos = my.bufpos + len("<![CDATA[")
-  var buf = my.buf
   while true:
-    case buf[pos] 
+    case my.buf[pos]
     of ']':
-      if buf[pos+1] == ']' and buf[pos+2] == '>':
+      if my.buf[pos+1] == ']' and my.buf[pos+2] == '>':
         inc(pos, 3)
         break
       add(my.a, ']')
       inc(pos)
-    of '\0': 
+    of '\0':
       markError(my, errEndOfCDataExpected)
       break
-    of '\c': 
+    of '\c':
       pos = lexbase.handleCR(my, pos)
-      buf = my.buf
       add(my.a, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
-      buf = my.buf
       add(my.a, '\L')
+    of '/':
+      pos = lexbase.handleRefillChar(my, pos)
+      add(my.a, '/')
     else:
-      add(my.a, buf[pos])
-      inc(pos)    
+      add(my.a, my.buf[pos])
+      inc(pos)
   my.bufpos = pos # store back
   my.kind = xmlCData
 
-proc parseComment(my: var XmlParser) = 
+proc parseComment(my: var XmlParser) =
   var pos = my.bufpos + len("<!--")
-  var buf = my.buf
   while true:
-    case buf[pos] 
+    case my.buf[pos]
     of '-':
-      if buf[pos+1] == '-' and buf[pos+2] == '>':
+      if my.buf[pos+1] == '-' and my.buf[pos+2] == '>':
         inc(pos, 3)
         break
       if my.options.contains(reportComments): add(my.a, '-')
       inc(pos)
-    of '\0': 
+    of '\0':
       markError(my, errEndOfCommentExpected)
       break
-    of '\c': 
+    of '\c':
       pos = lexbase.handleCR(my, pos)
-      buf = my.buf
       if my.options.contains(reportComments): add(my.a, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
-      buf = my.buf
       if my.options.contains(reportComments): add(my.a, '\L')
+    of '/':
+      pos = lexbase.handleRefillChar(my, pos)
+      if my.options.contains(reportComments): add(my.a, '/')
     else:
-      if my.options.contains(reportComments): add(my.a, buf[pos])
+      if my.options.contains(reportComments): add(my.a, my.buf[pos])
       inc(pos)
   my.bufpos = pos
   my.kind = xmlComment
 
-proc parseWhitespace(my: var XmlParser, skip=false) = 
+proc parseWhitespace(my: var XmlParser, skip = false) =
   var pos = my.bufpos
-  var buf = my.buf
-  while true: 
-    case buf[pos]
-    of ' ', '\t': 
-      if not skip: add(my.a, buf[pos])
+  while true:
+    case my.buf[pos]
+    of ' ', '\t':
+      if not skip: add(my.a, my.buf[pos])
       inc(pos)
-    of '\c':  
+    of '\c':
       # the specification says that CR-LF, CR are to be transformed to LF
       pos = lexbase.handleCR(my, pos)
-      buf = my.buf
       if not skip: add(my.a, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
-      buf = my.buf
       if not skip: add(my.a, '\L')
     else:
       break
@@ -302,84 +424,84 @@ const
   NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
   NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'}
 
-proc parseName(my: var XmlParser, dest: var string) = 
+proc parseName(my: var XmlParser, dest: var string) =
   var pos = my.bufpos
-  var buf = my.buf
-  if buf[pos] in NameStartChar: 
+  if my.buf[pos] in NameStartChar:
     while true:
-      add(dest, buf[pos])
+      add(dest, my.buf[pos])
       inc(pos)
-      if buf[pos] notin NameChar: break
+      if my.buf[pos] notin NameChar: break
     my.bufpos = pos
   else:
     markError(my, errNameExpected)
 
-proc parseEntity(my: var XmlParser, dest: var string) = 
+proc parseEntity(my: var XmlParser, dest: var string) =
   var pos = my.bufpos+1
-  var buf = my.buf
   my.kind = xmlCharData
-  if buf[pos] == '#':
+  if my.buf[pos] == '#':
     var r: int
     inc(pos)
-    if buf[pos] == 'x': 
+    if my.buf[pos] == 'x':
       inc(pos)
       while true:
-        case buf[pos]
-        of '0'..'9': r = (r shl 4) or (ord(buf[pos]) - ord('0'))
-        of 'a'..'f': r = (r shl 4) or (ord(buf[pos]) - ord('a') + 10)
-        of 'A'..'F': r = (r shl 4) or (ord(buf[pos]) - ord('A') + 10)
+        case my.buf[pos]
+        of '0'..'9': r = (r shl 4) or (ord(my.buf[pos]) - ord('0'))
+        of 'a'..'f': r = (r shl 4) or (ord(my.buf[pos]) - ord('a') + 10)
+        of 'A'..'F': r = (r shl 4) or (ord(my.buf[pos]) - ord('A') + 10)
         else: break
         inc(pos)
     else:
-      while buf[pos] in {'0'..'9'}: 
-        r = r * 10 + (ord(buf[pos]) - ord('0'))
+      while my.buf[pos] in {'0'..'9'}:
+        r = r * 10 + (ord(my.buf[pos]) - ord('0'))
         inc(pos)
     add(dest, toUTF8(Rune(r)))
-  elif buf[pos] == 'l' and buf[pos+1] == 't' and buf[pos+2] == ';':
+  elif my.buf[pos] == 'l' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';':
     add(dest, '<')
     inc(pos, 2)
-  elif buf[pos] == 'g' and buf[pos+1] == 't' and buf[pos+2] == ';':
+  elif my.buf[pos] == 'g' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';':
     add(dest, '>')
     inc(pos, 2)
-  elif buf[pos] == 'a' and buf[pos+1] == 'm' and buf[pos+2] == 'p' and
-      buf[pos+3] == ';':
+  elif my.buf[pos] == 'a' and my.buf[pos+1] == 'm' and my.buf[pos+2] == 'p' and
+      my.buf[pos+3] == ';':
     add(dest, '&')
     inc(pos, 3)
-  elif buf[pos] == 'a' and buf[pos+1] == 'p' and buf[pos+2] == 'o' and 
-      buf[pos+3] == 's' and buf[pos+4] == ';':
+  elif my.buf[pos] == 'a' and my.buf[pos+1] == 'p' and my.buf[pos+2] == 'o' and
+      my.buf[pos+3] == 's' and my.buf[pos+4] == ';':
     add(dest, '\'')
     inc(pos, 4)
-  elif buf[pos] == 'q' and buf[pos+1] == 'u' and buf[pos+2] == 'o' and 
-      buf[pos+3] == 't' and buf[pos+4] == ';':
+  elif my.buf[pos] == 'q' and my.buf[pos+1] == 'u' and my.buf[pos+2] == 'o' and
+      my.buf[pos+3] == 't' and my.buf[pos+4] == ';':
     add(dest, '"')
     inc(pos, 4)
   else:
     my.bufpos = pos
-    parseName(my, dest)
+    var name = ""
+    parseName(my, name)
     pos = my.bufpos
-    if my.err != errNameExpected: 
+    if my.err != errNameExpected and my.buf[pos] == ';':
       my.kind = xmlEntity
     else:
       add(dest, '&')
-  if buf[pos] == ';': 
+    add(dest, name)
+  if my.buf[pos] == ';':
     inc(pos)
   else:
-    markError(my, errSemicolonExpected)
+    my.err = errSemicolonExpected
+    # do not overwrite 'my.state' here, it's a benign error
   my.bufpos = pos
 
-proc parsePI(my: var XmlParser) = 
+proc parsePI(my: var XmlParser) =
   inc(my.bufpos, "<?".len)
   parseName(my, my.a)
   var pos = my.bufpos
-  var buf = my.buf
   setLen(my.b, 0)
-  while true: 
-    case buf[pos]
+  while true:
+    case my.buf[pos]
     of '\0':
       markError(my, errQmGtExpected)
       break
     of '?':
-      if buf[pos+1] == '>':
+      if my.buf[pos+1] == '>':
         inc(pos, 2)
         break
       add(my.b, '?')
@@ -387,29 +509,29 @@ proc parsePI(my: var XmlParser) =
     of '\c':
       # the specification says that CR-LF, CR are to be transformed to LF
       pos = lexbase.handleCR(my, pos)
-      buf = my.buf      
       add(my.b, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
-      buf = my.buf
       add(my.b, '\L')
+    of '/':
+      pos = lexbase.handleRefillChar(my, pos)
+      add(my.b, '/')
     else:
-      add(my.b, buf[pos])
+      add(my.b, my.buf[pos])
       inc(pos)
   my.bufpos = pos
   my.kind = xmlPI
 
-proc parseSpecial(my: var XmlParser) = 
+proc parseSpecial(my: var XmlParser) =
   # things that start with <!
   var pos = my.bufpos + 2
-  var buf = my.buf
   var opentags = 0
-  while true: 
-    case buf[pos]
+  while true:
+    case my.buf[pos]
     of '\0':
       markError(my, errGtExpected)
       break
-    of '<': 
+    of '<':
       inc(opentags)
       inc(pos)
       add(my.a, '<')
@@ -420,190 +542,227 @@ proc parseSpecial(my: var XmlParser) =
       dec(opentags)
       inc(pos)
       add(my.a, '>')
-    of '\c':  
+    of '\c':
       pos = lexbase.handleCR(my, pos)
-      buf = my.buf
       add(my.a, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
-      buf = my.buf
       add(my.a, '\L')
+    of '/':
+      pos = lexbase.handleRefillChar(my, pos)
+      add(my.b, '/')
     else:
-      add(my.a, buf[pos])
+      add(my.a, my.buf[pos])
       inc(pos)
   my.bufpos = pos
   my.kind = xmlSpecial
 
-proc parseTag(my: var XmlParser) = 
+proc parseTag(my: var XmlParser) =
   inc(my.bufpos)
   parseName(my, my.a)
   # if we have no name, do not interpret the '<':
-  if my.a.len == 0: 
+  if my.a.len == 0:
     my.kind = xmlCharData
     add(my.a, '<')
     return
-  parseWhitespace(my, skip=true)
-  if my.buf[my.bufpos] in NameStartChar: 
+  parseWhitespace(my, skip = true)
+  if my.buf[my.bufpos] in NameStartChar:
     # an attribute follows:
     my.kind = xmlElementOpen
     my.state = stateAttr
     my.c = my.a # save for later
+    my.cIsEmpty = false
   else:
     my.kind = xmlElementStart
-    if my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>':
-      inc(my.bufpos, 2)
+    let slash = my.buf[my.bufpos] == '/'
+    if slash:
+      my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
+    if slash and my.buf[my.bufpos] == '>':
+      inc(my.bufpos)
       my.state = stateEmptyElementTag
-      my.c = nil
+      my.c = ""
+      my.cIsEmpty = true
     elif my.buf[my.bufpos] == '>':
-      inc(my.bufpos)  
+      inc(my.bufpos)
     else:
       markError(my, errGtExpected)
-  
-proc parseEndTag(my: var XmlParser) = 
-  inc(my.bufpos, 2)
+
+proc parseEndTag(my: var XmlParser) =
+  my.bufpos = lexbase.handleRefillChar(my, my.bufpos+1)
+  #inc(my.bufpos, 2)
   parseName(my, my.a)
-  parseWhitespace(my, skip=true)
+  parseWhitespace(my, skip = true)
   if my.buf[my.bufpos] == '>':
     inc(my.bufpos)
   else:
     markError(my, errGtExpected)
   my.kind = xmlElementEnd
 
-proc parseAttribute(my: var XmlParser) = 
+proc parseAttribute(my: var XmlParser) =
   my.kind = xmlAttribute
   setLen(my.a, 0)
   setLen(my.b, 0)
   parseName(my, my.a)
   # if we have no name, we have '<tag attr= key %&$$%':
-  if my.a.len == 0: 
+  if my.a.len == 0:
     markError(my, errGtExpected)
     return
-  parseWhitespace(my, skip=true)
+
+  let startPos = my.bufpos
+  parseWhitespace(my, skip = true)
   if my.buf[my.bufpos] != '=':
-    markError(my, errEqExpected)
+    if allowEmptyAttribs notin my.options or
+        (my.buf[my.bufpos] != '>' and my.bufpos == startPos):
+      markError(my, errEqExpected)
     return
+
   inc(my.bufpos)
-  parseWhitespace(my, skip=true)
+  parseWhitespace(my, skip = true)
 
   var pos = my.bufpos
-  var buf = my.buf
-  if buf[pos] in {'\'', '"'}:
-    var quote = buf[pos]
+  if my.buf[pos] in {'\'', '"'}:
+    var quote = my.buf[pos]
     var pendingSpace = false
     inc(pos)
-    while true: 
-      case buf[pos]
+    while true:
+      case my.buf[pos]
       of '\0':
         markError(my, errQuoteExpected)
         break
-      of '&': 
-        if pendingSpace: 
+      of '&':
+        if pendingSpace:
           add(my.b, ' ')
           pendingSpace = false
         my.bufpos = pos
         parseEntity(my, my.b)
         my.kind = xmlAttribute # parseEntity overwrites my.kind!
         pos = my.bufpos
-      of ' ', '\t': 
+      of ' ', '\t':
         pendingSpace = true
         inc(pos)
-      of '\c':  
+      of '\c':
         pos = lexbase.handleCR(my, pos)
-        buf = my.buf
         pendingSpace = true
-      of '\L': 
+      of '\L':
         pos = lexbase.handleLF(my, pos)
-        buf = my.buf
         pendingSpace = true
+      of '/':
+        pos = lexbase.handleRefillChar(my, pos)
+        add(my.b, '/')
       else:
-        if buf[pos] == quote:
+        if my.buf[pos] == quote:
           inc(pos)
           break
         else:
-          if pendingSpace: 
+          if pendingSpace:
             add(my.b, ' ')
             pendingSpace = false
-          add(my.b, buf[pos])
+          add(my.b, my.buf[pos])
           inc(pos)
+  elif allowUnquotedAttribs in my.options:
+    const disallowedChars = {'"', '\'', '`', '=', '<', '>', ' ',
+                             '\0', '\t', '\L', '\F', '\f'}
+    let startPos = pos
+    while (let c = my.buf[pos]; c notin disallowedChars):
+      if c == '&':
+        my.bufpos = pos
+        parseEntity(my, my.b)
+        my.kind = xmlAttribute # parseEntity overwrites my.kind!
+        pos = my.bufpos
+      elif c == '/':
+        pos = lexbase.handleRefillChar(my, pos)
+        add(my.b, '/')
+      else:
+        add(my.b, c)
+        inc(pos)
+    if pos == startPos:
+      markError(my, errAttributeValueExpected)
   else:
-    markError(my, errQuoteExpected)  
+    markError(my, errQuoteExpected)
+    # error corrections: guess what was meant
+    while my.buf[pos] != '>' and my.buf[pos] > ' ':
+      add(my.b, my.buf[pos])
+      inc pos
   my.bufpos = pos
-  parseWhitespace(my, skip=true)
-  
-proc parseCharData(my: var XmlParser) = 
+  parseWhitespace(my, skip = true)
+
+proc parseCharData(my: var XmlParser) =
   var pos = my.bufpos
-  var buf = my.buf
-  while true: 
-    case buf[pos]
+  while true:
+    case my.buf[pos]
     of '\0', '<', '&': break
-    of '\c':  
+    of '\c':
       # the specification says that CR-LF, CR are to be transformed to LF
       pos = lexbase.handleCR(my, pos)
-      buf = my.buf
       add(my.a, '\L')
-    of '\L': 
+    of '\L':
       pos = lexbase.handleLF(my, pos)
-      buf = my.buf
       add(my.a, '\L')
+    of '/':
+      pos = lexbase.handleRefillChar(my, pos)
+      add(my.a, '/')
     else:
-      add(my.a, buf[pos])
+      add(my.a, my.buf[pos])
       inc(pos)
   my.bufpos = pos
   my.kind = xmlCharData
 
-proc rawGetTok(my: var XmlParser) = 
+proc rawGetTok(my: var XmlParser) =
   my.kind = xmlError
   setLen(my.a, 0)
   var pos = my.bufpos
-  var buf = my.buf
-  case buf[pos]
-  of '<': 
-    case buf[pos+1]
+  case my.buf[pos]
+  of '<':
+    case my.buf[pos+1]
     of '/':
       parseEndTag(my)
     of '!':
-      if buf[pos+2] == '[' and buf[pos+3] == 'C' and buf[pos+4] == 'D' and
-          buf[pos+5] == 'A' and buf[pos+6] == 'T' and buf[pos+7] == 'A' and
-          buf[pos+8] == '[':
+      if my.buf[pos+2] == '[' and my.buf[pos+3] == 'C' and
+          my.buf[pos+4] == 'D' and my.buf[pos+5] == 'A' and
+          my.buf[pos+6] == 'T' and my.buf[pos+7] == 'A' and
+          my.buf[pos+8] == '[':
         parseCDATA(my)
-      elif buf[pos+2] == '-' and buf[pos+3] == '-': 
+      elif my.buf[pos+2] == '-' and my.buf[pos+3] == '-':
         parseComment(my)
-      else: 
+      else:
         parseSpecial(my)
     of '?':
       parsePI(my)
-    else: 
+    else:
       parseTag(my)
-  of ' ', '\t', '\c', '\l': 
+  of ' ', '\t', '\c', '\l':
     parseWhitespace(my)
     my.kind = xmlWhitespace
-  of '\0': 
+  of '\0':
     my.kind = xmlEof
   of '&':
     parseEntity(my, my.a)
-  else: 
+  else:
     parseCharData(my)
   assert my.kind != xmlError
-    
-proc getTok(my: var XmlParser) = 
+
+proc getTok(my: var XmlParser) =
   while true:
+    let lastKind = my.kind
     rawGetTok(my)
     case my.kind
-    of xmlComment: 
+    of xmlComment:
       if my.options.contains(reportComments): break
-    of xmlWhitespace: 
-      if my.options.contains(reportWhitespace): break
+    of xmlWhitespace:
+      if my.options.contains(reportWhitespace) or lastKind in {xmlCharData,
+          xmlComment, xmlEntity}:
+        break
     else: break
-    
-proc next*(my: var XmlParser) = 
+
+proc next*(my: var XmlParser) =
   ## retrieves the first/next event. This controls the parser.
   case my.state
   of stateNormal:
-    getTok(my)  
+    getTok(my)
   of stateStart:
     my.state = stateNormal
     getTok(my)
-    if my.kind == xmlPI and my.a == "xml": 
+    if my.kind == xmlPI and my.a == "xml":
       # just skip the first ``<?xml >`` processing instruction
       getTok(my)
   of stateAttr:
@@ -612,24 +771,28 @@ proc next*(my: var XmlParser) =
       my.kind = xmlElementClose
       inc(my.bufpos)
       my.state = stateNormal
-    elif my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>': 
-      my.kind = xmlElementClose
-      inc(my.bufpos, 2)
-      my.state = stateEmptyElementTag
+    elif my.buf[my.bufpos] == '/':
+      my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
+      if my.buf[my.bufpos] == '>':
+        my.kind = xmlElementClose
+        inc(my.bufpos)
+        my.state = stateEmptyElementTag
+      else:
+        markError(my, errGtExpected)
     else:
       parseAttribute(my)
       # state remains the same
   of stateEmptyElementTag:
     my.state = stateNormal
     my.kind = xmlElementEnd
-    if not my.c.isNil:
+    if not my.cIsEmpty:
       my.a = my.c
-  of stateError: 
+  of stateError:
     my.kind = xmlError
     my.state = stateNormal
-  
-when isMainModule:
-  import os
+
+when not defined(testing) and isMainModule:
+  import std/os
   var s = newFileStream(paramStr(1), fmRead)
   if s == nil: quit("cannot open the file" & paramStr(1))
   var x: XmlParser
@@ -645,13 +808,13 @@ when isMainModule:
     of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
     of xmlElementStart: echo("<$1>" % x.elementName)
     of xmlElementEnd: echo("</$1>" % x.elementName)
-    
-    of xmlElementOpen: echo("<$1" % x.elementName) 
-    of xmlAttribute:   
+
+    of xmlElementOpen: echo("<$1" % x.elementName)
+    of xmlAttribute:
       echo("Key: " & x.attrKey)
       echo("Value: " & x.attrValue)
-    
-    of xmlElementClose: echo(">") 
+
+    of xmlElementClose: echo(">")
     of xmlCData:
       echo("<![CDATA[$1]]>" % x.charData)
     of xmlEntity:
@@ -659,4 +822,3 @@ when isMainModule:
     of xmlSpecial:
       echo("SPECIAL: " & x.charData)
   close(x)
-