summary refs log tree commit diff stats
path: root/lib/pure
diff options
context:
space:
mode:
authorAraq <rumpf_a@web.de>2013-06-27 01:04:56 +0200
committerAraq <rumpf_a@web.de>2013-06-27 01:04:56 +0200
commita674c039fb959166681f13f809670807b89e0b41 (patch)
treeec4676bf220ed2e49e8fff9b1071eaf2bfcae3d4 /lib/pure
parentff1d68c50b5df56645eb09a523c08726772bec4d (diff)
downloadNim-a674c039fb959166681f13f809670807b89e0b41.tar.gz
fixed a long standing xml parser bug; html parser bugfixes + speed improvements
Diffstat (limited to 'lib/pure')
-rw-r--r--lib/pure/htmlparser.nim286
-rw-r--r--lib/pure/parsexml.nim19
-rw-r--r--lib/pure/xmltree.nim10
3 files changed, 242 insertions, 73 deletions
diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim
index ad952fc41..c94170803 100644
--- a/lib/pure/htmlparser.nim
+++ b/lib/pure/htmlparser.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2010 Andreas Rumpf
+#        (c) Copyright 2013 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -35,8 +35,12 @@ type
     tagAddress,    ## the HTML ``address`` element
     tagApplet,     ## the deprecated HTML ``applet`` element
     tagArea,       ## the HTML ``area`` element
+    tagArticle,    ## the HTML ``article`` element
+    tagAside,      ## the HTML ``aside`` element
+    tagAudio,      ## the HTML ``audio`` element
     tagB,          ## the HTML ``b`` element
     tagBase,       ## the HTML ``base`` element
+    tagBdi,        ## the HTML ``bdi`` element
     tagBdo,        ## the deprecated HTML ``bdo`` element
     tagBasefont,   ## the deprecated HTML ``basefont`` element
     tagBig,        ## the HTML ``big`` element
@@ -44,22 +48,31 @@ type
     tagBody,       ## the HTML ``body`` element
     tagBr,         ## the HTML ``br`` element
     tagButton,     ## the HTML ``button`` element
+    tagCanvas,     ## the HTML ``canvas`` element
     tagCaption,    ## the HTML ``caption`` element
     tagCenter,     ## the deprecated HTML ``center`` element
     tagCite,       ## the HTML ``cite`` element
     tagCode,       ## the HTML ``code`` element
     tagCol,        ## the HTML ``col`` element
     tagColgroup,   ## the HTML ``colgroup`` element
+    tagCommand,    ## the HTML ``command`` element
+    tagDatalist,   ## the HTML ``datalist`` element
     tagDd,         ## the HTML ``dd`` element
     tagDel,        ## the HTML ``del`` element
+    tagDetails,    ## the HTML ``details`` element
     tagDfn,        ## the HTML ``dfn`` element
+    tagDialog,     ## the HTML ``dialog`` element
     tagDiv,        ## the HTML ``div`` element
     tagDir,        ## the deprecated HTML ``dir`` element
     tagDl,         ## the HTML ``dl`` element
     tagDt,         ## the HTML ``dt`` element
     tagEm,         ## the HTML ``em`` element
+    tagEmbed,      ## the HTML ``embed`` element
     tagFieldset,   ## the HTML ``fieldset`` element
+    tagFigcaption, ## the HTML ``figcaption`` element
+    tagFigure,     ## the HTML ``figure`` element
     tagFont,       ## the deprecated HTML ``font`` element
+    tagFooter,     ## the HTML ``footer`` element
     tagForm,       ## the HTML ``form`` element
     tagFrame,      ## the HTML ``frame`` element
     tagFrameset,   ## the deprecated HTML ``frameset`` element
@@ -70,6 +83,8 @@ type
     tagH5,         ## the HTML ``h5`` element
     tagH6,         ## the HTML ``h6`` element
     tagHead,       ## the HTML ``head`` element
+    tagHeader,     ## the HTML ``header`` element
+    tagHgroup,     ## the HTML ``hgroup`` element
     tagHtml,       ## the HTML ``html`` element
     tagHr,         ## the HTML ``hr`` element
     tagI,          ## the HTML ``i`` element
@@ -79,13 +94,17 @@ type
     tagIns,        ## the HTML ``ins`` element
     tagIsindex,    ## the deprecated HTML ``isindex`` element
     tagKbd,        ## the HTML ``kbd`` element
+    tagKeygen,     ## the HTML ``keygen`` element
     tagLabel,      ## the HTML ``label`` element
     tagLegend,     ## the HTML ``legend`` element
     tagLi,         ## the HTML ``li`` element
     tagLink,       ## the HTML ``link`` element
     tagMap,        ## the HTML ``map`` element
+    tagMark,       ## the HTML ``mark`` element
     tagMenu,       ## the deprecated HTML ``menu`` element
     tagMeta,       ## the HTML ``meta`` element
+    tagMeter,      ## the HTML ``meter`` element
+    tagNav,        ## the HTML ``nav`` element
     tagNobr,       ## the deprecated HTML ``nobr`` element
     tagNoframes,   ## the deprecated HTML ``noframes`` element
     tagNoscript,   ## the HTML ``noscript`` element
@@ -93,20 +112,28 @@ type
     tagOl,         ## the HTML ``ol`` element
     tagOptgroup,   ## the HTML ``optgroup`` element
     tagOption,     ## the HTML ``option`` element
+    tagOutput,     ## the HTML ``output`` element
     tagP,          ## the HTML ``p`` element
     tagParam,      ## the HTML ``param`` element
     tagPre,        ## the HTML ``pre`` element
+    tagProgress,   ## the HTML ``progress`` element
     tagQ,          ## the HTML ``q`` element
+    tagRp,         ## the HTML ``rp`` element
+    tagRt,         ## the HTML ``rt`` element
+    tagRuby,       ## the HTML ``ruby`` element
     tagS,          ## the deprecated HTML ``s`` element
     tagSamp,       ## the HTML ``samp`` element
     tagScript,     ## the HTML ``script`` element
+    tagSection,    ## the HTML ``section`` element
     tagSelect,     ## the HTML ``select`` element
     tagSmall,      ## the HTML ``small`` element
+    tagSource,     ## the HTML ``source`` element
     tagSpan,       ## the HTML ``span`` element
     tagStrike,     ## the deprecated HTML ``strike`` element
     tagStrong,     ## the HTML ``strong`` element
     tagStyle,      ## the HTML ``style`` element
     tagSub,        ## the HTML ``sub`` element
+    tagSummary,    ## the HTML ``summary`` element
     tagSup,        ## the HTML ``sup`` element
     tagTable,      ## the HTML ``table`` element
     tagTbody,      ## the HTML ``tbody`` element
@@ -115,37 +142,46 @@ type
     tagTfoot,      ## the HTML ``tfoot`` element
     tagTh,         ## the HTML ``th`` element
     tagThead,      ## the HTML ``thead`` element
+    tagTime,       ## the HTML ``time`` element
     tagTitle,      ## the HTML ``title`` element
     tagTr,         ## the HTML ``tr`` element
+    tagTrack,      ## the HTML ``track`` element
     tagTt,         ## the HTML ``tt`` element
     tagU,          ## the deprecated HTML ``u`` element
     tagUl,         ## the HTML ``ul`` element
-    tagVar         ## the HTML ``var`` element
+    tagVar,        ## the HTML ``var`` element
+    tagVideo,      ## the HTML ``video`` element
+    tagWbr         ## the HTML ``wbr`` element
 
 const
   tagToStr* = [
-    "a", "abbr", "acronym", "address", "applet", "area", 
-    "b", "base", "basefont", "bdo", "big", "blockquote", "body", 
-    "br", "button", "caption", "center", "cite", "code", 
-    "col", "colgroup", "dd", "del", "dfn", "div", 
-    "dir", "dl", "dt", "em", "fieldset", "font", 
+    "a", "abbr", "acronym", "address", "applet", "area", "article", 
+    "aside", "audio",
+    "b", "base", "basefont", "bdi", "bdo", "big", "blockquote", "body", 
+    "br", "button", "canvas", "caption", "center", "cite", "code", 
+    "col", "colgroup", "command",
+    "datalist", "dd", "del", "details", "dfn", "dialog", "div", 
+    "dir", "dl", "dt", "em", "embed", "fieldset", 
+    "figcaption", "figure", "font", "footer",
     "form", "frame", "frameset", "h1", "h2", "h3", 
-    "h4", "h5", "h6", "head", "html", "hr", 
+    "h4", "h5", "h6", "head", "header", "hgroup", "html", "hr", 
     "i", "iframe", "img", "input", "ins", "isindex", 
-    "kbd", "label", "legend", "li", "link", "map", 
-    "menu", "meta", "nobr", "noframes", "noscript", "object", "ol", 
-    "optgroup", "option", "p", "param", "pre", "q", 
-    "s", "samp", "script", "select", "small", "span", 
-    "strike", "strong", "style", "sub", "sup", "table", 
-    "tbody", "td", "textarea", "tfoot", "th", "thead", 
-    "title", "tr", "tt", "u", "ul", "var"]
+    "kbd", "keygen", "label", "legend", "li", "link", "map", "mark",
+    "menu", "meta", "meter", "nav", "nobr", "noframes", "noscript", 
+    "object", "ol", 
+    "optgroup", "option", "output", "p", "param", "pre", "progress", "q", 
+    "rp", "rt", "ruby", "s", "samp", "script", "section", "select", "small", 
+    "source", "span", "strike", "strong", "style", 
+    "sub", "summary", "sup", "table", 
+    "tbody", "td", "textarea", "tfoot", "th", "thead", "time",
+    "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr"]
   InlineTags* = {tagA, tagAbbr, tagAcronym, tagApplet, tagB, tagBasefont,
     tagBdo, tagBig, tagBr, tagButton, tagCite, tagCode, tagDel, tagDfn,
     tagEm, tagFont, tagI, tagImg, tagIns, tagInput, tagIframe, tagKbd,
     tagLabel, tagMap, tagObject, tagQ, tagSamp, tagScript, tagSelect,
     tagSmall, tagSpan, tagStrong, tagSub, tagSup, tagTextarea, tagTt,
     tagVar, tagApplet, tagBasefont, tagFont, tagIframe, tagU, tagS, 
-    tagStrike}
+    tagStrike, tagWbr}
   BlockTags* = {tagAddress, tagBlockquote, tagCenter, tagDel, tagDir, tagDiv, 
     tagDl, tagFieldset, tagForm, tagH1, tagH2, tagH3, tagH4, 
     tagH5, tagH6, tagHr, tagIns, tagIsindex, tagMenu, tagNoframes, tagNoscript, 
@@ -153,7 +189,7 @@ const
     tagMenu, tagNoframes}
   SingleTags* = {tagArea, tagBase, tagBasefont, 
     tagBr, tagCol, tagFrame, tagHr, tagImg, tagIsindex,
-    tagLink, tagMeta, tagParam}
+    tagLink, tagMeta, tagParam, tagWbr}
   
   Entities = [
     ("nbsp", 0x00A0), ("iexcl", 0x00A1), ("cent", 0x00A2), ("pound", 0x00A3),
@@ -228,28 +264,149 @@ const
     ("loz", 0x25CA), ("spades", 0x2660), ("clubs", 0x2663),
     ("hearts", 0x2665), ("diams", 0x2666)]
 
-proc binaryStrSearch(x: openarray[string], y: string): int = 
-  ## XXX put this into the library somewhere!
-  var a = 0
-  var b = len(x) - 1
-  while a <= b: 
-    var mid = (a + b) div 2
-    var c = cmp(x[mid], y)
-    if c < 0: a = mid + 1
-    elif c > 0: b = mid - 1
-    else: return mid
-  result = - 1
+proc allLower(s: string): bool =
+  ## Reports whether every character of `s` lies in ``'a'..'z'``.
+  ## Digits, upper-case letters and punctuation all make it answer false;
+  ## the empty string answers true.
+  result = true
+  for ch in s:
+    if ch notin {'a'..'z'}: return false
+
+# Maps the tag name `s` to its ``THtmlTag`` value with a single string
+# ``case``. The match is exact and every branch is spelled in lower case, so
+# callers have to lower-case `s` beforehand; any string without a branch
+# yields ``tagUnknown``.
+proc toHtmlTag(s: string): THtmlTag =
+  case s
+  of "a": tagA
+  of "abbr": tagAbbr
+  of "acronym": tagAcronym
+  of "address": tagAddress
+  of "applet": tagApplet
+  of "area": tagArea
+  of "article": tagArticle
+  of "aside": tagAside
+  of "audio": tagAudio
+  of "b": tagB
+  of "base": tagBase
+  of "basefont": tagBasefont
+  of "bdi": tagBdi
+  of "bdo": tagBdo
+  of "big": tagBig
+  of "blockquote": tagBlockquote
+  of "body": tagBody
+  of "br": tagBr
+  of "button": tagButton
+  of "canvas": tagCanvas
+  of "caption": tagCaption
+  of "center": tagCenter
+  of "cite": tagCite
+  of "code": tagCode
+  of "col": tagCol
+  of "colgroup": tagColgroup
+  of "command": tagCommand
+  of "datalist": tagDatalist
+  of "dd": tagDd
+  of "del": tagDel
+  of "details": tagDetails
+  of "dfn": tagDfn
+  of "dialog": tagDialog
+  of "div": tagDiv
+  of "dir": tagDir
+  of "dl": tagDl
+  of "dt": tagDt
+  of "em": tagEm
+  of "embed": tagEmbed
+  of "fieldset": tagFieldset
+  of "figcaption": tagFigcaption
+  of "figure": tagFigure
+  of "font": tagFont
+  of "footer": tagFooter
+  of "form": tagForm
+  of "frame": tagFrame
+  of "frameset": tagFrameset
+  of "h1": tagH1
+  of "h2": tagH2
+  of "h3": tagH3
+  of "h4": tagH4
+  of "h5": tagH5
+  of "h6": tagH6
+  of "head": tagHead
+  of "header": tagHeader
+  of "hgroup": tagHgroup
+  of "html": tagHtml
+  of "hr": tagHr
+  of "i": tagI
+  of "iframe": tagIframe
+  of "img": tagImg
+  of "input": tagInput
+  of "ins": tagIns
+  of "isindex": tagIsindex
+  of "kbd": tagKbd
+  of "keygen": tagKeygen
+  of "label": tagLabel
+  of "legend": tagLegend
+  of "li": tagLi
+  of "link": tagLink
+  of "map": tagMap
+  of "mark": tagMark
+  of "menu": tagMenu
+  of "meta": tagMeta
+  of "meter": tagMeter
+  of "nav": tagNav
+  of "nobr": tagNobr
+  of "noframes": tagNoframes
+  of "noscript": tagNoscript
+  of "object": tagObject
+  of "ol": tagOl
+  of "optgroup": tagOptgroup
+  of "option": tagOption
+  of "output": tagOutput
+  of "p": tagP
+  of "param": tagParam
+  of "pre": tagPre
+  of "progress": tagProgress
+  of "q": tagQ
+  of "rp": tagRp
+  of "rt": tagRt
+  of "ruby": tagRuby
+  of "s": tagS
+  of "samp": tagSamp
+  of "script": tagScript
+  of "section": tagSection
+  of "select": tagSelect
+  of "small": tagSmall
+  of "source": tagSource
+  of "span": tagSpan
+  of "strike": tagStrike
+  of "strong": tagStrong
+  of "style": tagStyle
+  of "sub": tagSub
+  of "summary": tagSummary
+  of "sup": tagSup
+  of "table": tagTable
+  of "tbody": tagTbody
+  of "td": tagTd
+  of "textarea": tagTextarea
+  of "tfoot": tagTfoot
+  of "th": tagTh
+  of "thead": tagThead
+  of "time": tagTime
+  of "title": tagTitle
+  of "tr": tagTr
+  of "track": tagTrack
+  of "tt": tagTt
+  of "u": tagU
+  of "ul": tagUl
+  of "var": tagVar
+  of "video": tagVideo
+  of "wbr": tagWbr
+  else: tagUnknown
 
 proc htmlTag*(n: PXmlNode): THtmlTag = 
   ## gets `n`'s tag as a ``THtmlTag``.
+  ## The result is memoized in ``n.clientData``; the value 0 is treated as
+  ## "not computed yet". NOTE(review): an unrecognized tag presumably also
+  ## stores 0 (``tagUnknown``) and is then looked up again on every call --
+  ## confirm against the ``THtmlTag`` declaration.
   if n.clientData == 0:
-    n.clientData = binaryStrSearch(tagToStr, n.tag)+1
+    n.clientData = toHtmlTag(n.tag).ord
   result = THtmlTag(n.clientData)
 
 proc htmlTag*(s: string): THtmlTag =
   ## converts `s` to a ``THtmlTag``. If `s` is no HTML tag, ``tagUnknown`` is
   ## returned.
+  ## Matching ignores case: `s` is lower-cased before the lookup unless
+  ## ``allLower`` reports that it already consists only of ``'a'..'z'``,
+  ## in which case the ``toLower`` call is skipped.
-  result = THtmlTag(binaryStrSearch(tagToStr, s.toLower)+1)
+  let s = if allLower(s): s else: s.toLower
+  result = toHtmlTag(s)
 
 proc entityToUtf8*(entity: string): string = 
   ## converts an HTML entity name like ``&Uuml;`` to its UTF-8 equivalent.
@@ -267,11 +424,13 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode
 proc expected(x: var TXmlParser, n: PXmlNode): string =
   result = errorMsg(x, "</" & n.tag & "> expected")
 
+# Shorthand for ``rawData(x)``; the parsing procs below use it to read the
+# name of the element the parser is currently positioned at.
+template elemName(x: expr): expr = rawData(x)
+
 proc untilElementEnd(x: var TXmlParser, result: PXmlNode, 
                      errors: var seq[string]) =
   # we parsed e.g. ``<br>`` and don't really expect a ``</br>``: 
   if result.htmlTag in singleTags:
-    if x.kind != xmlElementEnd or cmpIgnoreCase(x.elementName, result.tag) != 0:
+    if x.kind != xmlElementEnd or cmpIgnoreCase(x.elemName, result.tag) != 0:
       return
   while true:
     case x.kind
@@ -279,28 +438,29 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
       case result.htmlTag
       of tagLi, tagP, tagDt, tagDd, tagInput, tagOption:
         # some tags are common to have no ``</end>``, like ``<li>``:
-        if htmlTag(x.elementName) in {tagLi, tagP, tagDt, tagDd, tagInput,
-                                      tagOption}:
+        if htmlTag(x.elemName) in {tagLi, tagP, tagDt, tagDd, tagInput,
+                                   tagOption}:
           errors.add(expected(x, result))
           break
-        when false:
-          if htmlTag(x.elementName) notin InlineTags:
-            errors.add(expected(x, result))
-            break
-      of tagTr, tagTd, tagTh, tagTfoot, tagThead:
-        if htmlTag(x.elementName) in {tagTr, tagTd, tagTh, tagTfoot, tagThead}:
+      of tagTd, tagTh, tagTfoot, tagThead:
+        if htmlTag(x.elemName) in {tagTr, tagTd, tagTh, tagTfoot, tagThead}:
+          errors.add(expected(x, result))
+          break
+      of tagTr:
+        if htmlTag(x.elemName) == tagTr:
           errors.add(expected(x, result))
           break
       of tagOptgroup:
-        if htmlTag(x.elementName) in {tagOption, tagOptgroup}:
+        if htmlTag(x.elemName) in {tagOption, tagOptgroup}:
           errors.add(expected(x, result))
           break
       else: nil
       result.addNode(parse(x, errors))
     of xmlElementEnd: 
-      if cmpIgnoreCase(x.elementName, result.tag) == 0: 
+      if cmpIgnoreCase(x.elemName, result.tag) == 0: 
         next(x)
       else:
+        echo "5; expected: ", result.htmltag, " ", x.elemName 
         errors.add(expected(x, result))
         # do not skip it here!
       break
@@ -313,10 +473,10 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
 proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
   case x.kind
   of xmlComment: 
-    result = newComment(x.charData)
+    result = newComment(x.rawData)
     next(x)
   of xmlCharData, xmlWhitespace:
-    result = newText(x.charData)
+    result = newText(x.rawData)
     next(x)
   of xmlPI, xmlSpecial:
     # we just ignore processing instructions for now
@@ -325,19 +485,19 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
     errors.add(errorMsg(x))
     next(x)
   of xmlElementStart:
-    result = newElement(x.elementName.toLower)
+    result = newElement(x.elemName.toLower)
     next(x)
     untilElementEnd(x, result, errors)
   of xmlElementEnd:
-    errors.add(errorMsg(x, "unexpected ending tag: " & x.elementName))
+    errors.add(errorMsg(x, "unexpected ending tag: " & x.elemName))
   of xmlElementOpen: 
-    result = newElement(x.elementName.toLower)
+    result = newElement(x.elemName.toLower)
     next(x)
     result.attrs = newStringTable()
     while true: 
       case x.kind
       of xmlAttribute:
-        result.attrs[x.attrKey] = x.attrValue
+        result.attrs[x.rawData] = x.rawData2
         next(x)
       of xmlElementClose:
         next(x)
@@ -355,10 +515,10 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
     errors.add(errorMsg(x, "<some_tag> expected"))
     next(x)
   of xmlCData: 
-    result = newCData(x.charData)
+    result = newCData(x.rawData)
     next(x)
   of xmlEntity:
-    var u = entityToUtf8(x.entityName)
+    var u = entityToUtf8(x.rawData)
     if u.len != 0: result = newText(u)
     next(x)
   of xmlEof: nil
@@ -372,15 +532,17 @@ proc parseHtml*(s: PStream, filename: string,
   next(x)
   # skip the DOCTYPE:
   if x.kind == xmlSpecial: next(x)
-  result = parse(x, errors)
-  if x.kind != xmlEof:
-    errors.add(errorMsg(x, "EOF expected"))
+  
+  result = newElement("document")
+  result.addNode(parse(x, errors))
+  #if x.kind != xmlEof:
+  #  errors.add(errorMsg(x, "EOF expected"))
   while x.kind != xmlEof:
     var oldPos = x.bufpos # little hack to see if we made any progress
     result.addNode(parse(x, errors))
-    if x.bufpos == oldPos: 
+    if x.bufpos == oldPos:
       # force progress!
-      next(x) 
+      next(x)
   close(x)
 
 proc parseHtml*(s: PStream): PXmlNode = 
@@ -400,22 +562,9 @@ proc loadHtml*(path: string, errors: var seq[string]): PXmlNode =
 proc loadHtml*(path: string): PXmlNode = 
   ## Loads and parses HTML from file specified by ``path``, and returns 
   ## a ``PXmlNode``. All parsing errors are ignored.
-  var errors: seq[string] = @[]  
+  var errors: seq[string] = @[]
   result = loadHtml(path, errors)
 
-when true:
-  nil
-else:
-  proc checkHtmlAux(n: PXmlNode, errors: var seq[string]) =
-    nil
-  
-  proc checkHtmlStructure*(n: PXmlNode, errors: var seq[string]) =
-    ## checks the HTML structure after parsing for other errors like 
-    ## a ``<h1>`` element within a ``<p>`` element.
-    if n == nil or n.htmlTag != tagHtml: 
-      errors.add("<html> tag expected")
-    checkHtmlAux(n, errors)
-  
 when isMainModule:
   import os
 
@@ -429,4 +578,3 @@ when isMainModule:
     f.close()
   else:
     quit("cannot write test.txt")
-  
diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim
index c49986087..667b8aed6 100644
--- a/lib/pure/parsexml.nim
+++ b/lib/pure/parsexml.nim
@@ -131,6 +131,10 @@ proc close*(my: var TXmlParser) {.inline.} =
   ## closes the parser `my` and its associated input stream.
   lexbase.close(my)
 
+proc kind*(my: TXmlParser): TXmlEventKind {.inline.} = 
+  ## yields the kind of the event the XML parser `my` is currently at.
+  result = my.kind
+
 proc charData*(my: TXmlParser): string {.inline.} = 
   ## returns the character data for the events: ``xmlCharData``, 
   ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
@@ -138,10 +142,6 @@ proc charData*(my: TXmlParser): string {.inline.} =
                      xmlSpecial})
   return my.a
 
-proc kind*(my: TXmlParser): TXmlEventKind {.inline.} = 
-  ## returns the current event type for the XML parser
-  return my.kind
-
 proc elementName*(my: TXmlParser): string {.inline.} = 
   ## returns the element name for the events: ``xmlElementStart``, 
   ## ``xmlElementEnd``, ``xmlElementOpen``
@@ -173,6 +173,16 @@ proc PIRest*(my: TXmlParser): string {.inline.} =
   assert(my.kind == xmlPI)
   return my.b
 
+proc rawData*(my: TXmlParser): string {.inline.} =
+  ## returns the underlying 'data' string by reference.
+  ## This is only used for speed hacks.
+  ## The parser's first data field (``my.a``) is handed out via
+  ## ``shallowCopy`` and, unlike ``PIRest``, no assertion on the current
+  ## event kind is made.
+  ## NOTE(review): the result presumably aliases the parser's internal
+  ## string and should not be kept across ``next`` -- confirm.
+  shallowCopy(result, my.a)
+
+proc rawData2*(my: TXmlParser): string {.inline.} =
+  ## returns the underlying second 'data' string by reference.
+  ## This is only used for speed hacks.
+  ## The parser's second data field (``my.b``, the one ``PIRest`` returns)
+  ## is handed out via ``shallowCopy``; no assertion on the current event
+  ## kind is made.
+  ## NOTE(review): the result presumably aliases the parser's internal
+  ## string and should not be kept across ``next`` -- confirm.
+  shallowCopy(result, my.b)
+
 proc getColumn*(my: TXmlParser): int {.inline.} = 
   ## get the current column the parser has arrived at.
   result = getColNumber(my, my.bufPos)
@@ -439,6 +449,7 @@ proc parseTag(my: var TXmlParser) =
     if my.buf[my.bufpos] == '/' and my.buf[my.bufpos+1] == '>':
       inc(my.bufpos, 2)
       my.state = stateEmptyElementTag
+      my.c = nil
     elif my.buf[my.bufpos] == '>':
       inc(my.bufpos)  
     else:
diff --git a/lib/pure/xmltree.nim b/lib/pure/xmltree.nim
index e3fc27237..d5821d8f3 100644
--- a/lib/pure/xmltree.nim
+++ b/lib/pure/xmltree.nim
@@ -71,6 +71,16 @@ proc text*(n: PXmlNode): string {.inline.} =
   assert n.k in {xnText, xnComment, xnCData, xnEntity}
   result = n.fText
 
+proc rawText*(n: PXmlNode): string {.inline.} =
+  ## returns the underlying 'text' string by reference.
+  ## This is only used for speed hacks.
+  ## Reads the same field (``n.fText``) as ``text`` but through
+  ## ``shallowCopy`` and without asserting the node kind.
+  ## NOTE(review): the result presumably shares storage with the node, so
+  ## it should be treated as read-only -- confirm.
+  shallowCopy(result, n.fText)
+
+proc rawTag*(n: PXmlNode): string {.inline.} =
+  ## returns the underlying 'tag' string by reference.
+  ## This is only used for speed hacks.
+  ## Hands out ``n.fTag`` through ``shallowCopy``; no check of the node
+  ## kind is performed here.
+  ## NOTE(review): the result presumably shares storage with the node, so
+  ## it should be treated as read-only -- confirm.
+  shallowCopy(result, n.fTag)
+
 proc innerText*(n: PXmlNode): string =
   ## gets the inner text of `n`. `n` has to be an ``xnElement`` node. Only
   ## ``xnText`` and ``xnEntity`` nodes are considered part of `n`'s inner text,