bugfixes for unicode; xmlparser; htmlparser; scanner

author: rumpf_a@web.de <> 2010-02-20 19:21:38 +0100
committer: rumpf_a@web.de <> 2010-02-20 19:21:38 +0100
commit: 6bc16904edd3738ab97573b9eeb3a6a7cce9574c (patch)
tree: a24577d18f693a0b5497ad78b54c4d20cb711fc6 /lib
parent: 64da2f16813bbf03b8a2117d7c4abffd1adf525f (diff)
download: Nim-6bc16904edd3738ab97573b9eeb3a6a7cce9574c.tar.gz
7 files changed, 175 insertions, 60 deletions
diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim
index 982fdd088..278bf9b90 100644
--- a/lib/pure/htmlparser.nim
+++ b/lib/pure/htmlparser.nim
@@ -265,7 +265,7 @@ proc addNode(father, son: PXmlNode) =
 proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode
 
 proc expected(x: var TXmlParser, n: PXmlNode): string =
-  result = errorMsg(x, "</" & n.tag & "$1> expected")
+  result = errorMsg(x, "</" & n.tag & "> expected")
 
 proc untilElementEnd(x: var TXmlParser, result: PXmlNode, 
                      errors: var seq[string]) =
@@ -378,17 +378,19 @@ proc parseHtml*(s: PStream): PXmlNode =
   var errors: seq[string] = @[]
   result = parseHtml(s, "unknown_html_doc", errors)
 
-proc loadHtml*(path: string, reportErrors = false): PXmlNode = 
+proc loadHtml*(path: string, errors: var seq[string]): PXmlNode = 
   ## Loads and parses HTML from file specified by ``path``, and returns 
-  ## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are
-  ## ``echo``ed, otherwise they are ignored.
+  ## a ``PXmlNode``.  Every parsing error that occurred is added to
+  ## the `errors` sequence.
   var s = newFileStream(path, fmRead)
   if s == nil: raise newException(EIO, "Unable to read file: " & path)
-  
-  var errors: seq[string] = @[]
   result = parseHtml(s, path, errors)
-  if reportErrors: 
-    for msg in items(errors): echo(msg)
+
+proc loadHtml*(path: string): PXmlNode =
+  ## Convenience overload: parses the HTML file at ``path`` and returns its
+  ## ``PXmlNode``. Parsing errors are collected internally and then dropped.
+  var discarded: seq[string] = @[]
+  return loadHtml(path, discarded)
 
 when true:
   nil
@@ -402,4 +404,18 @@ else:
     if n == nil or n.htmlTag != tagHtml: 
       errors.add("<html> tag expected")
     checkHtmlAux(n, errors)
+  
+when isMainModule:
+  # Manual smoke test: parses the HTML file named by the first command line
+  # argument, echoes every parsing error and dumps the tree to "test.txt".
+  import os
+
+  var problems: seq[string] = @[]
+  var tree = loadHtml(paramStr(1), problems)
+  for msg in items(problems): echo msg
+
+  var outFile: TFile
+  if not open(outFile, "test.txt", fmWrite): quit("cannot write test.txt")
+  outFile.write($tree)
+  outFile.close()
   
\ No newline at end of file
diff --git a/lib/pure/os.nim b/lib/pure/os.nim
index ef526993a..4bb25098d 100644
--- a/lib/pure/os.nim
+++ b/lib/pure/os.nim
@@ -1,7 +1,7 @@
 #
 #
 #            Nimrod's Runtime Library
-#        (c) Copyright 2009 Andreas Rumpf
+#        (c) Copyright 2010 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
@@ -619,9 +619,11 @@ proc sameFileContent*(path1, path2: string): bool =
   close(a)
   close(b)
 
-proc copyFile*(dest, source: string) =
+proc copyFile*(dest, source: string) {.deprecated.} =
   ## Copies a file from `source` to `dest`. If this fails,
   ## `EOS` is raised.
+  ## **Deprecated since version 0.8.8**: Use this proc with named arguments
+  ## only, because the order will change!
   when defined(Windows):
     if CopyFileA(source, dest, 0'i32) == 0'i32: OSError()
   else:
@@ -647,8 +649,10 @@ proc copyFile*(dest, source: string) =
     close(s)
     close(d)
 
-proc moveFile*(dest, source: string) =
+proc moveFile*(dest, source: string) {.deprecated.} =
   ## Moves a file from `source` to `dest`. If this fails, `EOS` is raised.
+  ## **Deprecated since version 0.8.8**: Use this proc with named arguments
+  ## only, because the order will change!
   if crename(source, dest) != 0'i32: OSError()
 
 proc removeFile*(file: string) =
diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim
index bebbe56c5..099509afe 100644
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -83,8 +83,8 @@ proc toUTF8*(c: TRune): string =
     result[0] = chr(i)
   elif i <=% 0x07FF:
     result = newString(2)
-    result[0] = chr(i shr 6 or 0b110_0000)
-    result[1] = chr(i and ones(6) or 0b10_000000)
+    result[0] = chr((i shr 6) or 0b110_00000)
+    result[1] = chr((i and ones(6)) or 0b10_000000)
   elif i <=% 0xFFFF:
     result = newString(3)
     result[0] = chr(i shr 12 or 0b1110_0000)
diff --git a/lib/pure/xmldom.nim b/lib/pure/xmldom.nim
index 76c666de0..babf60108 100644
--- a/lib/pure/xmldom.nim
+++ b/lib/pure/xmldom.nim
@@ -227,7 +227,7 @@ proc createAttributeNS*(doc: PDocument, namespaceURI: string, qualifiedName: str
     raise newException(EInvalidCharacterErr, "Invalid character")
   # Exceptions
   if qualifiedName.contains(':'):
-    if namespaceURI == nil or namespaceURI == "":
+    if namespaceURI == nil:
       raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil")
     elif qualifiedName.split(':')[0].toLower() == "xml" and namespaceURI != "http://www.w3.org/XML/1998/namespace":
       raise newException(ENamespaceErr, 
@@ -303,7 +303,7 @@ proc createElement*(doc: PDocument, tagName: string): PElement =
 proc createElementNS*(doc: PDocument, namespaceURI: string, qualifiedName: string): PElement =
   ## Creates an element of the given qualified name and namespace URI.
   if qualifiedName.contains(':'):
-    if namespaceURI == nil or namespaceURI == "":
+    if namespaceURI == nil:
       raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil")
     elif qualifiedName.split(':')[0].toLower() == "xml" and namespaceURI != "http://www.w3.org/XML/1998/namespace":
       raise newException(ENamespaceErr, 
@@ -464,8 +464,11 @@ proc localName*(n: PNode): string =
 
 proc namespaceURI*(n: PNode): string =
   ## Returns this nodes namespace URI
-
+  
   return n.FNamespaceURI
+  
+proc `namespaceURI=`*(n: PNode, value: string) = 
+  n.FNamespaceURI = value # setter: stores `value` verbatim, no validation
 
 proc nextSibling*(n: PNode): PNode =
   ## Returns the next sibling of this node
@@ -507,7 +510,7 @@ proc previousSibling*(n: PNode): PNode =
       return n.FParentNode.childNodes[i - 1]
   return nil
   
-proc `prefix=`*(n: var PNode, value: string) =
+proc `prefix=`*(n: PNode, value: string) =
   ## Modifies the prefix of this node
 
   # Setter
@@ -530,11 +533,10 @@ proc `prefix=`*(n: var PNode, value: string) =
   if n.nodeType == ElementNode:
     var el: PElement = PElement(n)
     el.FTagName = value & ":" & n.FLocalName
-    n = PNode(el)
+
   elif n.nodeType == AttributeNode:
     var attr: PAttr = PAttr(n)
     attr.FName = value & ":" & n.FLocalName
-    n = PNode(attr)
 
 # Procedures
 proc appendChild*(n: PNode, newChild: PNode) =
@@ -1078,4 +1080,4 @@ proc nodeToXml(n: PNode, indent: int = 0): string =
 proc `$`*(doc: PDocument): string =
   ## Converts a PDocument object into a string representation of it's XML
   result = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
-  result.add(nodeToXml(doc.documentElement))
+  result.add(nodeToXml(doc.documentElement))
\ No newline at end of file
diff --git a/lib/pure/xmldomparser.nim b/lib/pure/xmldomparser.nim
index b73baf1ff..f338ca2e5 100644
--- a/lib/pure/xmldomparser.nim
+++ b/lib/pure/xmldomparser.nim
@@ -14,9 +14,34 @@ import xmldom, os, streams, parsexml, strutils
 #XMLDom's Parser - Turns XML into a Document
 
 type
-  #Parsing errors
+  # Parsing errors
   EMismatchedTag* = object of E_Base ## Raised when a tag is not properly closed
   EParserError* = object of E_Base ## Raised when an unexpected XML Parser event occurs
+
+  # For namespaces
+  xmlnsAttr = tuple[name, value: string, ownerElement: PElement]
+
+var nsList: seq[xmlnsAttr] = @[] # Used for storing namespaces
+
+proc getNS(prefix: string): string =
+  ## Returns the namespace URI that the ``xmlns`` declarations recorded in
+  ## `nsList` bind to `prefix`; falls back to the default namespace
+  ## (``xmlns="..."``) and finally to "" when nothing is declared.
+  ## Declarations are appended to `nsList` as they are parsed, so the list is
+  ## scanned backwards to let the innermost declaration shadow outer ones.
+  var defaultNS = ""
+  var seenDefault = false
+  for i in countdown(len(nsList)-1, 0):
+    var key = nsList[i].name
+    if ":" in key:
+      if key.split(':')[1] == prefix:
+        return nsList[i].value
+    elif key == "xmlns" and not seenDefault:
+      # Only a fallback: it must never win over an explicit prefix binding,
+      # hence it is remembered here instead of being returned right away.
+      defaultNS = nsList[i].value
+      seenDefault = true
+  return defaultNS
     
 proc parseText(x: var TXmlParser, doc: var PDocument): PText =
   result = doc.createTextNode(x.charData())
@@ -28,24 +53,33 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
     case x.kind()
     of xmlEof:
       break
-    of xmlElementStart:
-      if n.tagName() != "":
-        n.appendChild(parseElement(x, doc))
-      else:
-        n = doc.createElement(x.elementName)
-    of xmlElementOpen:
+    of xmlElementStart, xmlElementOpen:
       if n.tagName() != "":
         n.appendChild(parseElement(x, doc))
       else:
-        if x.elementName.contains(':'):
-          #TODO: NamespaceURI
-          n = doc.createElementNS("nil", x.elementName)
-        else:  
-          n = doc.createElement(x.elementName)
+        n = doc.createElementNS("", x.elementName)
         
     of xmlElementEnd:
       if x.elementName == n.nodeName:
         # n.normalize() # Remove any whitespace etc.
+        
+        var ns: string
+        if x.elementName.contains(':'):
+          ns = getNS(x.elementName.split(':')[0])
+        else:
+          ns = getNS("")
+        
+        n.namespaceURI = ns
+        
+        # Remove any namespaces this element declared
+        var count = 0 # Variable which keeps the index
+                      # We need to edit it..
+        for i in low(nsList)..len(nsList)-1:
+          if nsList[count][2] == n:
+            nsList.delete(count)
+            dec(count)
+          inc(count)
+
         return n
       else: #The wrong element is ended
         raise newException(EMismatchedTag, "Mismatched tag at line " & 
@@ -54,11 +88,15 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
     of xmlCharData:
       n.appendChild(parseText(x, doc))
     of xmlAttribute:
+      if x.attrKey == "xmlns" or x.attrKey.startsWith("xmlns:"):
+        nsList.add((x.attrKey, x.attrValue, n))
+        
       if x.attrKey.contains(':'):
-        #TODO: NamespaceURI
-        n.setAttributeNS("nil", x.attrKey, x.attrValue)
+        var ns = getNS(x.attrKey)
+        n.setAttributeNS(ns, x.attrKey, x.attrValue)
       else:
         n.setAttribute(x.attrKey, x.attrValue)
+
     of xmlCData:
       n.appendChild(doc.createCDATASection(x.charData()))
     of xmlComment:
@@ -75,16 +113,13 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement =
 
   raise newException(EMismatchedTag, 
     "Mismatched tag at line " & $x.getLine() & " column " & $x.getColumn)
-    
-proc loadXML*(path: string): PDocument =
-  ## Loads and parses XML from file specified by ``path``, and returns 
+
+proc loadXMLStream*(stream: PStream): PDocument =
+  ## Loads and parses XML from a stream specified by ``stream``, and returns 
   ## a ``PDocument``
-  
-  var s = newFileStream(path, fmRead)
-  if s == nil: raise newException(EIO, "Unable to read file " & path)
 
   var x: TXmlParser
-  open(x, s, path, {reportComments})
+  open(x, stream, nil, {reportComments})
   
   var XmlDoc: PDocument
   var DOM: PDOMImplementation = getDOM()
@@ -102,10 +137,32 @@ proc loadXML*(path: string): PDocument =
     else:
       raise newException(EParserError, "Unexpected XML Parser event")
 
-  close(x)
   return XmlDoc
 
+proc loadXML*(xml: string): PDocument =
+  ## Parses the XML document contained in the string ``xml`` (the markup
+  ## itself, not a file name) by wrapping it in a string stream and
+  ## delegating to ``loadXMLStream``. Returns the resulting ``PDocument``.
+  result = loadXMLStream(newStringStream(xml))
+  
+    
+proc loadXMLFile*(path: string): PDocument =
+  ## Loads and parses XML from the file specified by ``path`` and returns a
+  ## ``PDocument``. Raises ``EIO`` if the file cannot be opened.
+  var s = newFileStream(path, fmRead)
+  if s == nil: raise newException(EIO, "Unable to read file " & path)
+  try: result = loadXMLStream(s)
+  finally: close(s) # the stream is opened here, so release it even on errors
+
 
 when isMainModule:
-  var xml = loadXML(r"C:\Users\Dominik\Desktop\Code\Nimrod\xmldom\test.xml")
-  echo($xml)
+  var xml = loadXMLFile(r"C:\Users\Dominik\Desktop\Code\Nimrod\xmldom\test.xml")
+  #echo(xml.getElementsByTagName("m:test2")[0].namespaceURI)
+  #echo(xml.getElementsByTagName("bla:test")[0].namespaceURI)
+  #echo(xml.getElementsByTagName("test")[0].namespaceURI)
+  for i in items(xml.getElementsByTagName("*")):
+    if i.namespaceURI != nil:
+      echo(i.nodeName, "=", i.namespaceURI)
+
+    
+  echo($xml)
\ No newline at end of file
diff --git a/lib/pure/xmltreeparser.nim b/lib/pure/xmlparser.nim
index bf2c05570..635497fa8 100644
--- a/lib/pure/xmltreeparser.nim
+++ b/lib/pure/xmlparser.nim
@@ -25,6 +25,8 @@ proc raiseInvalidXml(errors: seq[string]) =
 proc addNode(father, son: PXmlNode) = 
   if son != nil: add(father, son)
 
+proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode
+
 proc untilElementEnd(x: var TXmlParser, result: PXmlNode, 
                      errors: var seq[string]) =
   while true:
@@ -33,11 +35,11 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
       if x.elementName == result.tag: 
         next(x)
       else:
-        errors.add(errorMsg(x, "</" & result.tag & "$1> expected"))
+        errors.add(errorMsg(x, "</" & result.tag & "> expected"))
         # do not skip it here!
       break
     of xmlEof:
-      errors.add(errorMsg(x, "</" & result.tag & "$1> expected"))
+      errors.add(errorMsg(x, "</" & result.tag & "> expected"))
       break
     else:
       result.addNode(parse(x, errors))
@@ -91,7 +93,7 @@ proc parse(x: var TXmlParser, errors: var seq[string]): PXmlNode =
     next(x)
   of xmlEntity:
     ## &entity;
-    ## XXX To implement!
+    errors.add(errorMsg(x, "unknown entity: " & x.entityName))
     next(x)
   of xmlEof: nil
 
@@ -110,6 +112,8 @@ proc parseXml*(s: PStream, filename: string,
     of xmlComment, xmlWhitespace: nil # just skip it
     of xmlError:
       errors.add(errorMsg(x))
+    of xmlSpecial:
+      errors.add(errorMsg(x, "<some_tag> expected"))      
     else:
       errors.add(errorMsg(x, "<some_tag> expected"))
       break
@@ -122,17 +126,33 @@ proc parseXml*(s: PStream): PXmlNode =
   result = parseXml(s, "unknown_html_doc", errors)
   if errors.len > 0: raiseInvalidXMl(errors)
 
-proc loadXml*(path: string, reportErrors = false): PXmlNode = 
+proc loadXml*(path: string, errors: var seq[string]): PXmlNode = 
   ## Loads and parses XML from file specified by ``path``, and returns 
-  ## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are
-  ## ``echo``ed, otherwise an exception is thrown.
+  ## a ``PXmlNode``. Every parsing error that occurred is added to the
+  ## `errors` sequence.
   var s = newFileStream(path, fmRead)
   if s == nil: raise newException(EIO, "Unable to read file: " & path)
-  
-  var errors: seq[string] = @[]
   result = parseXml(s, path, errors)
-  if reportErrors: 
-    for msg in items(errors): echo(msg)
-  elif errors.len > 0: 
-    raiseInvalidXMl(errors)
 
+proc loadXml*(path: string): PXmlNode =
+  ## Convenience overload: parses the XML file at ``path`` and returns its
+  ## ``PXmlNode``. If any parsing error was collected, all of them are
+  ## turned into an ``EInvalidXML`` exception.
+  var collected: seq[string] = @[]
+  result = loadXml(path, collected)
+  if collected.len != 0: raiseInvalidXMl(collected)
+
+when isMainModule:
+  # Manual smoke test: parses the XML file named by the first command line
+  # argument, echoes every parsing error and dumps the tree to "xmltest.txt".
+  import os
+
+  var errors: seq[string] = @[]
+  var x = loadXml(paramStr(1), errors)
+  for e in items(errors): echo e
+
+  var f: TFile
+  if not open(f, "xmltest.txt", fmWrite): quit("cannot write xmltest.txt")
+  f.write($x)
+  f.close()
+    
diff --git a/lib/pure/xmltree.nim b/lib/pure/xmltree.nim
index 2b0977874..7b77fe156 100644
--- a/lib/pure/xmltree.nim
+++ b/lib/pure/xmltree.nim
@@ -153,8 +153,15 @@ proc addIndent(result: var string, indent: int) =
   result.add("\n")
   for i in 1..indent: result.add(' ')
   
+proc noWhitespace(n: PXmlNode): bool =
+  ## Tells whether `n` has mixed content: true as soon as one of its
+  ## children is a text or entity node, false otherwise.
+  for idx in 0..n.len-1:
+    if n[idx].kind == xnText or n[idx].kind == xnEntity: return true
+  
 proc add*(result: var string, n: PXmlNode, indent = 0, indWidth = 2) = 
   ## adds the textual representation of `n` to `result`.
+  if n == nil: return
   case n.k
   of xnElement:
     result.add('<')
@@ -168,10 +175,19 @@ proc add*(result: var string, n: PXmlNode, indent = 0, indWidth = 2) =
         result.add('"')
     if n.len > 0:
       result.add('>')
-      for i in 0..n.len-1:
-        result.addIndent(indent+indWidth)
-        result.add(n[i], indent+indWidth, indWidth)
-      result.addIndent(indent)
+      if n.len > 1:
+        if noWhitespace(n):
+          # for mixed leaves, we cannot output whitespace for readability,
+          # because this would be wrong. For example: ``a<b>b</b>`` is
+          # different from ``a <b>b</b>``.
+          for i in 0..n.len-1: result.add(n[i], indent+indWidth, indWidth)
+        else: 
+          for i in 0..n.len-1:
+            result.addIndent(indent+indWidth)
+            result.add(n[i], indent+indWidth, indWidth)
+          result.addIndent(indent)
+      else:
+        result.add(n[0], indent+indWidth, indWidth)
       result.add("</")
       result.add(n.fTag)
       result.add(">")
author	rumpf_a@web.de <>	2010-02-20 19:21:38 +0100
committer	rumpf_a@web.de <>	2010-02-20 19:21:38 +0100
commit	6bc16904edd3738ab97573b9eeb3a6a7cce9574c (patch)
tree	a24577d18f693a0b5497ad78b54c4d20cb711fc6 /lib
parent	64da2f16813bbf03b8a2117d7c4abffd1adf525f (diff)
download	Nim-6bc16904edd3738ab97573b9eeb3a6a7cce9574c.tar.gz