diff options
-rwxr-xr-x | doc/lib.txt | 11 | ||||
-rw-r--r-- | lib/pure/htmlparser.nim | 247 | ||||
-rwxr-xr-x | lib/pure/parsexml.nim | 2 | ||||
-rwxr-xr-x | lib/pure/re.nim | 6 | ||||
-rw-r--r-- | lib/pure/xmldom.nim | 244 | ||||
-rw-r--r-- | lib/pure/xmldomparser.nim | 13 | ||||
-rwxr-xr-x | lib/pure/xmlgen.nim | 8 | ||||
-rw-r--r-- | lib/pure/xmltree.nim | 231 | ||||
-rw-r--r-- | lib/pure/xmltreeparser.nim | 52 | ||||
-rwxr-xr-x | lib/system.nim | 4 | ||||
-rwxr-xr-x | web/news.txt | 6 |
11 files changed, 731 insertions, 93 deletions
diff --git a/doc/lib.txt b/doc/lib.txt index 06edf997c..609889607 100755 --- a/doc/lib.txt +++ b/doc/lib.txt @@ -172,7 +172,16 @@ XML Processing This module implements the XML DOM Level 2. * `xmldomparser <xmldomparser.html>`_ - This module parses a XML Document into a XML DOM Document representation. + This module parses an XML Document into an XML DOM Document representation. + +* `xmltree <xmltree.html>`_ + A simple XML tree. More efficient and simpler than the DOM. + +* `xmltreeparser <xmltreeparser.html>`_ + This module parses an XML document and creates its XML tree representation. + +* `htmlparser <htmlparser.html>`_ + This module parses an HTML document and creates its XML tree representation. Code generation diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim new file mode 100644 index 000000000..df840e15c --- /dev/null +++ b/lib/pure/htmlparser.nim @@ -0,0 +1,247 @@ +# +# +# Nimrod's Runtime Library +# (c) Copyright 2010 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module parses an HTML document and creates its XML tree representation. +## It is supposed to handle the *wild* HTML the real world uses. +## +## It can be used to parse a wild HTML document and output it as a valid XHTML +## document (if you are lucky): +## +## .. code-block:: nimrod +## +## echo loadHtml("mydirty.html") +## +## +## Every tag in the resulting tree is in lower case. +## +## **Note:** The resulting ``PXmlNode``s already use the ``clientData`` field, +## so it cannot be used by clients of this library. 
+ +import streams, parsexml, xmltree + +type + THtmlTag* = enum ## list of all supported HTML tags; order will always be + ## alphabetically + tagUnknown, ## unknown HTML element + tagA, ## the HTML ``a`` element + tagAcronym, ## the HTML ``acronym`` element + tagAddress, ## the HTML ``address`` element + tagArea, ## the HTML ``area`` element + tagB, ## the HTML ``b`` element + tagBase, ## the HTML ``base`` element + tagBig, ## the HTML ``big`` element + tagBlockquote, ## the HTML ``blockquote`` element + tagBody, ## the HTML ``body`` element + tagBr, ## the HTML ``br`` element + tagButton, ## the HTML ``button`` element + tagCaption, ## the HTML ``caption`` element + tagCite, ## the HTML ``cite`` element + tagCode, ## the HTML ``code`` element + tagCol, ## the HTML ``col`` element + tagColgroup, ## the HTML ``colgroup`` element + tagDd, ## the HTML ``dd`` element + tagDel, ## the HTML ``del`` element + tagDfn, ## the HTML ``dfn`` element + tagDiv, ## the HTML ``div`` element + tagDl, ## the HTML ``dl`` element + tagDt, ## the HTML ``dt`` element + tagEm, ## the HTML ``em`` element + tagFieldset, ## the HTML ``fieldset`` element + tagForm, ## the HTML ``form`` element + tagH1, ## the HTML ``h1`` element + tagH2, ## the HTML ``h2`` element + tagH3, ## the HTML ``h3`` element + tagH4, ## the HTML ``h4`` element + tagH5, ## the HTML ``h5`` element + tagH6, ## the HTML ``h6`` element + tagHead, ## the HTML ``head`` element + tagHtml, ## the HTML ``html`` element + tagHr, ## the HTML ``hr`` element + tagI, ## the HTML ``i`` element + tagImg, ## the HTML ``img`` element + tagInput, ## the HTML ``input`` element + tagIns, ## the HTML ``ins`` element + tagKbd, ## the HTML ``kbd`` element + tagLabel, ## the HTML ``label`` element + tagLegend, ## the HTML ``legend`` element + tagLi, ## the HTML ``li`` element + tagLink, ## the HTML ``link`` element + tagMap, ## the HTML ``map`` element + tagMeta, ## the HTML ``meta`` element + tagNoscript, ## the HTML ``noscript`` element + 
tagObject, ## the HTML ``object`` element + tagOl, ## the HTML ``ol`` element + tagOptgroup, ## the HTML ``optgroup`` element + tagOption, ## the HTML ``option`` element + tagP, ## the HTML ``p`` element + tagParam, ## the HTML ``param`` element + tagPre, ## the HTML ``pre`` element + tagQ, ## the HTML ``q`` element + tagSamp, ## the HTML ``samp`` element + tagScript, ## the HTML ``script`` element + tagSelect, ## the HTML ``select`` element + tagSmall, ## the HTML ``small`` element + tagSpan, ## the HTML ``span`` element + tagStrong, ## the HTML ``strong`` element + tagStyle, ## the HTML ``style`` element + tagSub, ## the HTML ``sub`` element + tagSup, ## the HTML ``sup`` element + tagTable, ## the HTML ``table`` element + tagTbody, ## the HTML ``tbody`` element + tagTd, ## the HTML ``td`` element + tagTextarea, ## the HTML ``textarea`` element + tagTfoot, ## the HTML ``tfoot`` element + tagTh, ## the HTML ``th`` element + tagThead, ## the HTML ``thead`` element + tagTitle, ## the HTML ``title`` element + tagTr, ## the HTML ``tr`` element + tagTt, ## the HTML ``tt`` element + tagUl, ## the HTML ``ul`` element + tagVar ## the HTML ``var`` element + +const + tagStrs = [ + "a", "acronym", "address", "area", "b", "base", "big", "blockquote", + "body", "br", "button", "caption", "cite", "code", "col", "colgroup", + "dd", "del", "dfn", "div", "dl", "dt", "em", "fieldset", + "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "html", "hr", + "i", "img", "input", "ins", "kbd", "label", "legend", "li", "link", + "map", "meta", "noscript", "object", "ol", "optgroup", "option", + "p", "param", "pre", "q", "samp", "script", "select", "small", + "span", "strong", "style", "sub", "sup", "table", "tbody", "td", + "textarea", "tfoot", "th", "thead", "title", "tr", "tt", "ul", "var" + ] + +proc binaryStrSearch(x: openarray[string], y: string): int = + ## XXX put this into the library somewhere! 
+ var a = 0 + var b = len(x) - 1 + while a <= b: + var mid = (a + b) div 2 + var c = cmp(x[mid], y) + if c < 0: + a = mid + 1 + elif c > 0: + b = mid - 1 + else: + return mid + result = - 1 + +proc htmlTag*(n: PXmlNode): THtmlTag = + ## gets `n`'s tag as a ``THtmlTag``. Even though results are cached, this + ## can be more expensive than comparing ``tag`` directly to a string. + if n.clientData == 0: + n.clientData = binaryStrSearch(tagStrs, n.tag)+1 + result = THtmlTag(n.clientData) + +proc parseElement(x: var TXmlParser, doc: var PDocument): PElement = + var n = doc.createElement("") + + while True: + case x.kind() + of xmlEof: + break + of xmlElementStart: + if n.tagName() != "": + n.appendChild(parseElement(x, doc)) + else: + n = doc.createElement(x.elementName) + of xmlElementOpen: + if n.tagName() != "": + n.appendChild(parseElement(x, doc)) + else: + if x.elementName.contains(':'): + #TODO: NamespaceURI + n = doc.createElementNS("nil", x.elementName) + else: + n = doc.createElement(x.elementName) + + of xmlElementEnd: + if x.elementName == n.nodeName: + # n.normalize() # Remove any whitespace etc. 
+ return n + else: #The wrong element is ended + raise newException(EMismatchedTag, "Mismatched tag at line " & + $x.getLine() & " column " & $x.getColumn) + + of xmlCharData: + n.appendChild(parseText(x, doc)) + of xmlAttribute: + if x.attrKey.contains(':'): + #TODO: NamespaceURI + n.setAttributeNS("nil", x.attrKey, x.attrValue) + else: + n.setAttribute(x.attrKey, x.attrValue) + of xmlCData: + n.appendChild(doc.createCDATASection(x.charData())) + of xmlComment: + n.appendChild(doc.createComment(x.charData())) + of xmlPI: + n.appendChild(doc.createProcessingInstruction(x.PIName(), x.PIRest())) + + of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial: + # Unused 'events' + + else: + raise newException(EParserError, "Unexpected XML Parser event") + x.next() + + raise newException(EMismatchedTag, + "Mismatched tag at line " & $x.getLine() & " column " & $x.getColumn) + + +proc parse*(x: var TXmlParser, father: PXmlNode) = + + +proc parseHtml*(s: PStream, filename: string, + errors: var seq[string]): PXmlNode = + ## parses the HTML from stream `s` and returns a ``PXmlNode``. Every + ## occurred parsing error is added to the `errors` sequence. + var x: TXmlParser + open(x, s, filename, {reportComments}) + + result = newElement("html") + while true: + x.next() + case x.kind + of xmlWhitespace: nil # just skip it + of xmlComment: + result.add(newComment(x.text)) + + while True: + x.next() + case x.kind + of xmlEof: break + of xmlElementStart, xmlElementOpen: + var el: PElement = parseElement(x, XmlDoc) + XmlDoc = dom.createDocument(el) + of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial: + # Unused 'events' + else: + raise newException(EParserError, "Unexpected XML Parser event") + close(x) + +proc parseHtml*(s: PStream): PXmlNode = + ## parses the HTML from stream `s` and returns a ``PXmlNode``. All parsing + ## errors are ignored. 
+ var errors: seq[string] = @[] + result = parseHtml(s, "unknown_html_doc", errors) + +proc loadHtml*(path: string, reportErrors = false): PXmlNode = + ## Loads and parses HTML from file specified by ``path``, and returns + ## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are + ## ``echo``ed. + var s = newFileStream(path, fmRead) + if s == nil: raise newException(EIO, "Unable to read file: " & path) + + var errors: seq[string] = @[] + result = parseHtml(s, path, errors) + if reportErrors: + for msg in items(errors): echo(msg) + diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim index 6809c0f7c..a209e8be0 100755 --- a/lib/pure/parsexml.nim +++ b/lib/pure/parsexml.nim @@ -364,7 +364,7 @@ proc parsePI(my: var TXmlParser) = break add(my.b, '?') inc(pos) - of '\c': + of '\c': # the specification says that CR-LF, CR are to be transformed to LF pos = lexbase.HandleCR(my, pos) buf = my.buf diff --git a/lib/pure/re.nim b/lib/pure/re.nim index 09a38440d..953f9c744 100755 --- a/lib/pure/re.nim +++ b/lib/pure/re.nim @@ -127,13 +127,13 @@ template `=~` *(s: string, pattern: TRegEx): expr = ## ## if line =~ re"\s*(\w+)\s*\=\s*(\w+)": ## # matches a key=value pair: - ## echo("Key: ", matches[1]) - ## echo("Value: ", matches[2]) + ## echo("Key: ", matches[0]) + ## echo("Value: ", matches[1]) ## elif line =~ re"\s*(\#.*)": ## # matches a comment ## # note that the implicit ``matches`` array is different from the ## # ``matches`` array of the first branch - ## echo("comment: ", matches[1]) + ## echo("comment: ", matches[0]) ## else: ## echo("syntax error") ## diff --git a/lib/pure/xmldom.nim b/lib/pure/xmldom.nim index 12578a793..4e9d721d7 100644 --- a/lib/pure/xmldom.nim +++ b/lib/pure/xmldom.nim @@ -9,33 +9,30 @@ import strutils -## This module implements the XML DOM Level 2 +## This module implements XML DOM Level 2 Core specification(http://www.w3.org/TR/2000/REC-DOM-Level-2-Core-20001113/core.html) 
-#http://www.w3.org/TR/2000/REC-DOM-Level-2-Core-20001113/core.html -#DOMString = String -#DOMTimeStamp = int16 ?? -#DECLARATIONS +#http://www.w3.org/TR/2000/REC-DOM-Level-2-Core-20001113/core.html #Exceptions type - EDOMException* = object of E_Base #Base exception object for all DOM Exceptions - EDOMStringSizeErr* = object of EDOMException #If the specified range of text does not fit into a DOMString - #Currently not used(Since DOMString is just string) - EHierarchyRequestErr* = object of EDOMException #If any node is inserted somewhere it doesn't belong - EIndexSizeErr* = object of EDOMException #If index or size is negative, or greater than the allowed value - EInuseAttributeErr* = object of EDOMException #If an attempt is made to add an attribute that is already in use elsewhere - EInvalidAccessErr* = object of EDOMException #If a parameter or an operation is not supported by the underlying object. - EInvalidCharacterErr* = object of EDOMException #This exception is raised when a string parameter contains an illegal character - EInvalidModificationErr* = object of EDOMException #If an attempt is made to modify the type of the underlying object. - EInvalidStateErr* = object of EDOMException #If an attempt is made to use an object that is not, or is no longer, usable. - ENamespaceErr* = object of EDOMException #If an attempt is made to create or change an object in a way which is incorrect with regard to namespaces. - ENotFoundErr* = object of EDOMException #If an attempt is made to reference a node in a context where it does not exist - ENotSupportedErr* = object of EDOMException #If the implementation does not support the requested type of object or operation. 
- ENoDataAllowedErr* = object of EDOMException #If data is specified for a node which does not support data - ENoModificationAllowedErr* = object of EDOMException #If an attempt is made to modify an object where modifications are not allowed - ESyntaxErr* = object of EDOMException #If an invalid or illegal string is specified. - EWrongDocumentErr* = object of EDOMException #If a node is used in a different document than the one that created it (that doesn't support it) + EDOMException* = object of E_Base ## Base exception object for all DOM Exceptions + EDOMStringSizeErr* = object of EDOMException ## If the specified range of text does not fit into a DOMString + ## Currently not used(Since DOMString is just string) + EHierarchyRequestErr* = object of EDOMException ## If any node is inserted somewhere it doesn't belong + EIndexSizeErr* = object of EDOMException ## If index or size is negative, or greater than the allowed value + EInuseAttributeErr* = object of EDOMException ## If an attempt is made to add an attribute that is already in use elsewhere + EInvalidAccessErr* = object of EDOMException ## If a parameter or an operation is not supported by the underlying object. + EInvalidCharacterErr* = object of EDOMException ## This exception is raised when a string parameter contains an illegal character + EInvalidModificationErr* = object of EDOMException ## If an attempt is made to modify the type of the underlying object. + EInvalidStateErr* = object of EDOMException ## If an attempt is made to use an object that is not, or is no longer, usable. + ENamespaceErr* = object of EDOMException ## If an attempt is made to create or change an object in a way which is incorrect with regard to namespaces. + ENotFoundErr* = object of EDOMException ## If an attempt is made to reference a node in a context where it does not exist + ENotSupportedErr* = object of EDOMException ## If the implementation does not support the requested type of object or operation. 
+ ENoDataAllowedErr* = object of EDOMException ## If data is specified for a node which does not support data + ENoModificationAllowedErr* = object of EDOMException ## If an attempt is made to modify an object where modifications are not allowed + ESyntaxErr* = object of EDOMException ## If an invalid or illegal string is specified. + EWrongDocumentErr* = object of EDOMException ## If a node is used in a different document than the one that created it (that doesn't support it) template newException(exceptn, message: expr): expr = block: # open a new scope @@ -65,24 +62,24 @@ type Feature = tuple[name: string, version: string] PDOMImplementation* = ref DOMImplementation DOMImplementation = object - Features: seq[Feature] #Read-Only + Features: seq[Feature] # Read-Only PNode* = ref Node Node = object - attributes: seq[PAttr] #Read-only - childNodes*: seq[PNode] #Read-only - FLocalName: string #Read-only - FNamespaceURI: string #Read-only - FNodeName: string #Read-only + attributes*: seq[PAttr] + childNodes*: seq[PNode] + FLocalName: string # Read-only + FNamespaceURI: string # Read-only + FNodeName: string # Read-only nodeValue*: string - FNodeType: int #Read-only - FOwnerDocument: PDocument #Read-Only - FParentNode: PNode #Read-Only + FNodeType: int # Read-only + FOwnerDocument: PDocument # Read-Only + FParentNode: PNode # Read-Only prefix*: string # Setting this should change some values... TODO! 
PElement* = ref Element Element = object of Node - FTagName: string #Read-only + FTagName: string # Read-only PCharacterData = ref CharacterData CharacterData = object of Node @@ -90,15 +87,15 @@ type PDocument* = ref Document Document = object of Node - FImplementation: PDOMImplementation #Read-only - FDocumentElement: PElement #Read-only + FImplementation: PDOMImplementation # Read-only + FDocumentElement: PElement # Read-only PAttr* = ref Attr Attr = object of Node - FName: string #Read-only - FSpecified: bool #Read-only + FName: string # Read-only + FSpecified: bool # Read-only value*: string - FOwnerElement: PElement #Read-only + FOwnerElement: PElement # Read-only PDocumentFragment* = ref DocumentFragment DocumentFragment = object of Node @@ -115,18 +112,18 @@ type PProcessingInstruction* = ref ProcessingInstruction ProcessingInstruction = object of Node data*: string - FTarget: string #Read-only + FTarget: string # Read-only -#DOMImplementation +# DOMImplementation proc getDOM*(): PDOMImplementation = - ##Returns a DOMImplementation + ## Returns a DOMImplementation var DOMImpl: PDOMImplementation new(DOMImpl) DOMImpl.Features = @[(name: "core", version: "2.0"), (name: "core", version: "1.0"), (name: "XML", version: "2.0")] return DOMImpl proc createDocument*(dom: PDOMImplementation, namespaceURI: string, qualifiedName: string): PDocument = - ##Creates an XML Document object of the specified type with its document element. + ## Creates an XML Document object of the specified type with its document element. var doc: PDocument new(doc) doc.FNamespaceURI = namespaceURI @@ -142,8 +139,9 @@ proc createDocument*(dom: PDOMImplementation, namespaceURI: string, qualifiedNam return doc proc createDocument*(dom: PDOMImplementation, n: PElement): PDocument = - ##Creates an XML Document object of the specified type with its document element. - #This procedure is not in the specification, it's provided for the parser. 
+ ## Creates an XML Document object of the specified type with its document element. + + # This procedure is not in the specification, it's provided for the parser. var doc: PDocument new(doc) doc.FDocumentElement = n @@ -153,7 +151,7 @@ proc createDocument*(dom: PDOMImplementation, n: PElement): PDocument = return doc proc hasFeature*(dom: PDOMImplementation, feature: string, version: string = ""): bool = - ##Returns ``true`` if this ``version`` of the DomImplementation implements ``feature``, otherwise ``false`` + ## Returns ``true`` if this ``version`` of the DomImplementation implements ``feature``, otherwise ``false`` for iName, iVersion in items(dom.Features): if iName == feature: if version == "": @@ -164,8 +162,8 @@ proc hasFeature*(dom: PDOMImplementation, feature: string, version: string = "") return False -#Document -#Attributes +# Document +# Attributes proc implementation*(doc: PDocument): PDOMImplementation = return doc.FImplementation @@ -173,9 +171,9 @@ proc implementation*(doc: PDocument): PDOMImplementation = proc documentElement*(doc: PDocument): PElement = return doc.FDocumentElement -#Internal procedures +# Internal procedures proc findNodes(nl: PNode, name: string): seq[PNode] = - #Made for getElementsByTagName + # Made for getElementsByTagName var r: seq[PNode] = @[] if nl.childNodes == nil: return @[] if nl.childNodes.len() == 0: return @[] @@ -192,7 +190,7 @@ proc findNodes(nl: PNode, name: string): seq[PNode] = return r proc findNodesNS(nl: PNode, namespaceURI: string, localName: string): seq[PNode] = - #Made for getElementsByTagNameNS + # Made for getElementsByTagNameNS var r: seq[PNode] = @[] if nl.childNodes == nil: return @[] if nl.childNodes.len() == 0: return @[] @@ -211,10 +209,10 @@ proc findNodesNS(nl: PNode, namespaceURI: string, localName: string): seq[PNode] #Procedures proc createAttribute*(doc: PDocument, name: string): PAttr = - ##Creates an Attr of the given name. 
Note that the Attr instance can then be set on an Element using the setAttributeNode method. - ##To create an attribute with a qualified name and namespace URI, use the createAttributeNS method. + ## Creates an Attr of the given name. Note that the Attr instance can then be set on an Element using the setAttributeNode method. + ## To create an attribute with a qualified name and namespace URI, use the createAttributeNS method. - #Check if name contains illegal characters + # Check if name contains illegal characters if illegalChars in name: raise newException(EInvalidCharacterErr, "Invalid character") @@ -230,12 +228,12 @@ proc createAttribute*(doc: PDocument, name: string): PAttr = return AttrNode proc createAttributeNS*(doc: PDocument, namespaceURI: string, qualifiedName: string): PAttr = - ##Creates an attribute of the given qualified name and namespace URI + ## Creates an attribute of the given qualified name and namespace URI - #Check if name contains illegal characters + # Check if name contains illegal characters if illegalChars in namespaceURI or illegalChars in qualifiedName: raise newException(EInvalidCharacterErr, "Invalid character") - #Exceptions + # Exceptions if qualifiedName.contains(':'): if namespaceURI == nil or namespaceURI == "": raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil") @@ -264,17 +262,17 @@ proc createAttributeNS*(doc: PDocument, namespaceURI: string, qualifiedName: str return AttrNode proc createCDATASection*(doc: PDocument, data: string): PCDATASection = - ##Creates a CDATASection node whose value is the specified string. + ## Creates a CDATASection node whose value is the specified string. 
var CData: PCDATASection new(CData) CData.data = data CData.nodeValue = data - CData.FNodeName = "#text" #Not sure about this, but this is technically a TextNode + CData.FNodeName = "#text" # Not sure about this, but this is technically a TextNode CData.FNodeType = CDataSectionNode return CData proc createComment*(doc: PDocument, data: string): PComment = - ##Creates a Comment node given the specified string. + ## Creates a Comment node given the specified string. var Comm: PComment new(Comm) Comm.data = data @@ -284,15 +282,15 @@ proc createComment*(doc: PDocument, data: string): PComment = return Comm proc createDocumentFragment*(doc: PDocument): PDocumentFragment = - ##Creates an empty DocumentFragment object. + ## Creates an empty DocumentFragment object. var DF: PDocumentFragment new(DF) return DF proc createElement*(doc: PDocument, tagName: string): PElement = - ##Creates an element of the type specified. + ## Creates an element of the type specified. - #Check if name contains illegal characters + # Check if name contains illegal characters if illegalChars in tagName: raise newException(EInvalidCharacterErr, "Invalid character") @@ -311,7 +309,7 @@ proc createElement*(doc: PDocument, tagName: string): PElement = return elNode proc createElementNS*(doc: PDocument, namespaceURI: string, qualifiedName: string): PElement = - ##Creates an element of the given qualified name and namespace URI. + ## Creates an element of the given qualified name and namespace URI. 
if qualifiedName.contains(':'): if namespaceURI == nil or namespaceURI == "": raise newException(ENamespaceErr, "When qualifiedName contains a prefix namespaceURI cannot be nil") @@ -319,7 +317,7 @@ proc createElementNS*(doc: PDocument, namespaceURI: string, qualifiedName: strin raise newException(ENamespaceErr, "When the namespace prefix is \"xml\" namespaceURI has to be \"http://www.w3.org/XML/1998/namespace\"") - #Check if name contains illegal characters + # Check if name contains illegal characters if illegalChars in namespaceURI or illegalChars in qualifiedName: raise newException(EInvalidCharacterErr, "Invalid character") @@ -342,7 +340,7 @@ proc createElementNS*(doc: PDocument, namespaceURI: string, qualifiedName: strin return elNode proc createProcessingInstruction*(doc: PDocument, target: string, data: string): PProcessingInstruction = - ##Creates a ProcessingInstruction node given the specified name and data strings. + ## Creates a ProcessingInstruction node given the specified name and data strings. #Check if name contains illegal characters if illegalChars in target: @@ -356,7 +354,7 @@ proc createProcessingInstruction*(doc: PDocument, target: string, data: string): return PI proc createTextNode*(doc: PDocument, data: string): PText = #Propably TextNode - ##Creates a Text node given the specified string. + ## Creates a Text node given the specified string. var txtNode: PText new(txtNode) txtNode.data = data @@ -371,8 +369,8 @@ discard """proc getElementById*(doc: PDocument, elementId: string): PElement = #TODO""" proc getElementsByTagName*(doc: PDocument, tagName: string): seq[PNode] = - ##Returns a NodeList of all the Elements with a given tag name in - ##the order in which they are encountered in a preorder traversal of the Document tree. + ## Returns a NodeList of all the Elements with a given tag name in + ## the order in which they are encountered in a preorder traversal of the Document tree. 
var result: seq[PNode] = @[] if doc.FDocumentElement.FNodeName == tagName or tagName == "*": result.add(doc.FDocumentElement) @@ -381,8 +379,8 @@ proc getElementsByTagName*(doc: PDocument, tagName: string): seq[PNode] = return result proc getElementsByTagNameNS*(doc: PDocument, namespaceURI: string, localName: string): seq[PNode] = - ##Returns a NodeList of all the Elements with a given localName and namespaceURI - ##in the order in which they are encountered in a preorder traversal of the Document tree. + ## Returns a NodeList of all the Elements with a given localName and namespaceURI + ## in the order in which they are encountered in a preorder traversal of the Document tree. var result: seq[PNode] = @[] if doc.FDocumentElement.FLocalName == localName or localName == "*": if doc.FDocumentElement.FNamespaceURI == namespaceURI or namespaceURI == "*": @@ -450,57 +448,76 @@ proc importNode*(doc: PDocument, importedNode: PNode, deep: bool): PNode = # Node # Attributes -proc Attributes*(n: PNode): seq[PAttr] = - if n.attributes == nil: n.attributes = @[] # Initialize the sequence if it's nil - return n.attributes proc firstChild*(n: PNode): PNode = + ## Returns this node's first child + if n.childNodes.len() > 0: return n.childNodes[0] else: return nil proc lastChild*(n: PNode): PNode = + ## Returns this node's last child + if n.childNodes.len() > 0: return n.childNodes[n.childNodes.len() - 1] else: return nil proc localName*(n: PNode): string = + ## Returns this nodes local name + return n.FLocalName proc namespaceURI*(n: PNode): string = + ## Returns this nodes namespace URI + return n.FNamespaceURI proc nextSibling*(n: PNode): PNode = + ## Returns the next sibling of this node + var nLow: int = low(n.FParentNode.childNodes) var nHigh: int = high(n.FParentNode.childNodes) for i in nLow..nHigh: - if n.FParentNode.childNodes[i] == n: # HAVE TO TEST this line, not sure if ``==`` will work + if n.FParentNode.childNodes[i] == n: return n.FParentNode.childNodes[i + 1] 
return nil proc nodeName*(n: PNode): string = + ## Returns the name of this node + return n.FNodeName proc nodeType*(n: PNode): int = + ## Returns the type of this node + return n.FNodeType proc ownerDocument*(n: PNode): PDocument = + ## Returns the owner document of this node + return n.FOwnerDocument proc parentNode*(n: PNode): PNode = + ## Returns the parent node of this node + return n.FParentNode proc previousSibling*(n: PNode): PNode = + ## Returns the previous sibling of this node + var nLow: int = low(n.FParentNode.childNodes) var nHigh: int = high(n.FParentNode.childNodes) for i in nLow..nHigh: - if n.FParentNode.childNodes[i] == n: # HAVE TO TEST this line, not sure if ``==`` will work + if n.FParentNode.childNodes[i] == n: return n.FParentNode.childNodes[i - 1] return nil proc `prefix=`*(n: var PNode, value: string) = + ## Modifies the prefix of this node + # Setter # Check if name contains illegal characters if illegalChars in value: @@ -532,8 +549,11 @@ proc appendChild*(n: PNode, newChild: PNode) = ## Adds the node newChild to the end of the list of children of this node. ## If the newChild is already in the tree, it is first removed. 
- # TODO - Check if n contains newChild - # TODO - Exceptions + # Check if n contains newChild + if n.childNodes != nil: + for i in low(n.childNodes)..high(n.childNodes): + if n.childNodes[i] == newChild: + raise newException(EHierarchyRequestErr, "The node to append is already in this nodes children.") # Check if newChild is from this nodes document if n.FOwnerDocument != newChild.FOwnerDocument: @@ -542,6 +562,9 @@ proc appendChild*(n: PNode, newChild: PNode) = if n == newChild: raise newException(EHierarchyRequestErr, "You can't add a node into itself") + if n.nodeType in childlessObjects: + raise newException(ENoModificationAllowedErr, "Cannot append children to a childless node") + if n.childNodes == nil: n.childNodes = @[] newChild.FParentNode = n @@ -604,10 +627,43 @@ proc isSupported*(n: PNode, feature: string, version: string): bool = ## feature and that feature is supported by this node. return n.FOwnerDocument.FImplementation.hasFeature(feature, version) +proc isEmpty(s: string): bool = + + if s == "" or s == nil: + return True + for i in items(s): + if i != ' ': + return False + return True + proc normalize*(n: PNode) = - ## Puts all Text nodes in the full depth of the sub-tree underneath this Node + ## Merges all seperated TextNodes together, and removes any empty TextNodes + var curTextNode: PNode = nil + var i: int = 0 - # TODO + var newChildNodes: seq[PNode] = @[] + while True: + if i >= n.childNodes.len: + break + if n.childNodes[i].nodeType == TextNode: + + #If the TextNode is empty, remove it + if PText(n.childNodes[i]).data.isEmpty(): + inc(i) + + if curTextNode == nil: + curTextNode = n.childNodes[i] + else: + PText(curTextNode).data.add(PText(n.childNodes[i]).data) + curTextNode.nodeValue.add(PText(n.childNodes[i]).data) + inc(i) + else: + newChildNodes.add(curTextNode) + newChildNodes.add(n.childNodes[i]) + curTextNode = nil + + inc(i) + n.childNodes = newChildNodes proc removeChild*(n: PNode, oldChild: PNode): PNode = ## Removes the child 
node indicated by ``oldChild`` from the list of children, and returns it. @@ -791,26 +847,32 @@ proc setNamedItemNS*(NList: var seq[PAttr], arg: PAttr): PAttr = NList[index] = arg return item # Return the replaced node -# TODO - Maybe implement a ChildlessNode!^ - # CharacterData - Decided to implement this, # Didn't add the procedures, because you can just edit .data # Attr # Attributes proc name*(a: PAttr): string = + ## Returns the name of the Attribute + return a.FName proc specified*(a: PAttr): bool = + ## Specifies whether this attribute was specified in the original document + return a.FSpecified proc ownerElement*(a: PAttr): PElement = + ## Returns this Attributes owner element + return a.FOwnerElement # Element # Attributes proc tagName*(el: PElement): string = + ## Returns the Element Tag Name + return el.FTagName # Procedures @@ -960,11 +1022,29 @@ proc setAttributeNS*(el: PElement, namespaceURI, localName, value: string) = proc splitData*(TextNode: PText, offset: int): PText = ## Breaks this node into two nodes at the specified offset, ## keeping both in the tree as siblings. 
+ + if offset > TextNode.data.len(): + raise newException(EIndexSizeErr, "Index out of bounds") + + var left: string = TextNode.data.copy(0, offset) + TextNode.data = left + var right: string = TextNode.data.copy(offset, TextNode.data.len()) + + if TextNode.FParentNode != nil: + for i in low(TextNode.FParentNode.childNodes)..high(TextNode.FParentNode.childNodes): + if TextNode.FParentNode.childNodes[i] == TextNode: + var newNode: PText = TextNode.FOwnerDocument.createTextNode(right) + TextNode.FParentNode.childNodes.insert(newNode, i) + return newNode + else: + var newNode: PText = TextNode.FOwnerDocument.createTextNode(right) + return newNode - # TODO - need insert(seq[T]) # ProcessingInstruction -proc target*(PI: PProcessingInstruction): string = +proc target*(PI: PProcessingInstruction): string = + ## Returns the Processing Instructions target + return PI.FTarget diff --git a/lib/pure/xmldomparser.nim b/lib/pure/xmldomparser.nim index 90d4d85b1..9df60cab8 100644 --- a/lib/pure/xmldomparser.nim +++ b/lib/pure/xmldomparser.nim @@ -16,6 +16,7 @@ import xmldom, os, streams, parsexml, strutils type #Parsing errors EMismatchedTag* = object of E_Base ## Raised when a tag is not properly closed + EParserError* = object of E_Base ## Raised when an unexpected XML Parser event occurs template newException(exceptn, message: expr): expr = block: # open a new scope @@ -52,6 +53,7 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement = of xmlElementEnd: if x.elementName == n.nodeName: + # n.normalize() # Remove any whitespace etc. 
return n else: #The wrong element is ended raise newException(EMismatchedTag, "Mismatched tag at line " & @@ -71,8 +73,12 @@ proc parseElement(x: var TXmlParser, doc: var PDocument): PElement = n.appendChild(doc.createComment(x.charData())) of xmlPI: n.appendChild(doc.createProcessingInstruction(x.PIName(), x.PIRest())) + + of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial: + # Unused 'events' + else: - # echo(x.kind()) # XXX do nothing here!? + raise newException(EParserError, "Unexpected XML Parser event") x.next() raise newException(EMismatchedTag, @@ -99,9 +105,12 @@ proc loadXML*(path: string): PDocument = of xmlElementStart, xmlElementOpen: var el: PElement = parseElement(x, XmlDoc) XmlDoc = dom.createDocument(el) + of xmlWhitespace, xmlElementClose, xmlEntity, xmlSpecial: + # Unused 'events' else: - # echo(x.kind()) + raise newException(EParserError, "Unexpected XML Parser event") + close(x) return XmlDoc diff --git a/lib/pure/xmlgen.nim b/lib/pure/xmlgen.nim index 79a782252..29f2700f2 100755 --- a/lib/pure/xmlgen.nim +++ b/lib/pure/xmlgen.nim @@ -21,6 +21,10 @@ ## ## <h1><a href="http://force7.de/nimrod">Nimrod</a></h1> ## +## **Deprecated since version 0.8.8.** Use the macro ``<>`` in xmltree +## instead. 
+ +{.deprecated.} import macros, strutils @@ -52,8 +56,8 @@ proc xmlCheckedTag*(e: PNimrodNode, tag: string, # copy the attributes; when iterating over them these lists # will be modified, so that each attribute is only given one value - var req = splitSeq(reqAttr) - var opt = splitSeq(optAttr) + var req = split(reqAttr) + var opt = split(optAttr) result = newNimNode(nnkBracket, e) result.add(newStrLitNode("<")) result.add(newStrLitNode(tag)) diff --git a/lib/pure/xmltree.nim b/lib/pure/xmltree.nim new file mode 100644 index 000000000..aeec842d7 --- /dev/null +++ b/lib/pure/xmltree.nim @@ -0,0 +1,231 @@ +# +# +# Nimrod's Runtime Library +# (c) Copyright 2010 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## A simple XML tree. More efficient and simpler than the DOM. + +import macros, strtabs + +type + PXmlNode* = ref TXmlNode ## an XML tree consists of ``PXmlNode``s. + + TXmlNodeKind* = enum ## different kinds of ``PXmlNode``s + xnText, ## a text element + xnElement, ## an element with 0 or more children + xnCData, ## a CDATA node + xnComment ## an XML comment + + PXmlAttributes* = PStringTable ## an alias for a string to string mapping + + TXmlNode {.pure, final, acyclic.} = object + case k: TXmlNodeKind + of xnText, xnComment, xnCData: + fText: string + of xnElement: + fTag: string + s: seq[PXmlNode] + fAttr: PXmlAttributes + fClientData: int ## for other clients + +proc newXmlNode(kind: TXmlNodeKind): PXmlNode = + ## creates a new ``PXmlNode``. + new(result) + result.k = kind + +proc newElement*(tag: string): PXmlNode = + ## creates a new ``PXmlNode``. of kind ``xnText`` with the given `tag`. + result = newXmlNode(xnElement) + result.fTag = tag + result.s = @[] + # init attributes lazily to safe memory + +proc newText*(text: string): PXmlNode = + ## creates a new ``PXmlNode`` of kind ``xnText`` with the text `text`. 
+ result = newXmlNode(xnText) + result.fText = text + +proc newComment*(comment: string): PXmlNode = + ## creates a new ``PXmlNode`` of kind ``xnComment`` with the text `comment`. + result = newXmlNode(xnComment) + result.fText = comment + +proc newCData*(cdata: string): PXmlNode = + ## creates a new ``PXmlNode`` of kind ``xnComment`` with the text `cdata`. + result = newXmlNode(xnCData) + result.fText = cdata + +proc text*(n: PXmlNode): string {.inline.} = + ## gets the associated text with the node `n`. `n` can be a CDATA, Text + ## or comment node. + assert n.k in {xnText, xnComment, xnCData} + result = n.fText + +proc tag*(n: PXmlNode): string {.inline.} = + ## gets the tag name of `n`. `n` has to be an ``xnElement`` node. + assert n.k == xnElement + result = n.fTag + +proc add*(father, son: PXmlNode) {.inline.} = + ## adds the child `son` to `father`. + add(father.s, son) + +proc len*(n: PXmlNode): int {.inline.} = + ## returns the number `n`'s children. + if n.k == xnElement: result = len(n.s) + +proc kind*(n: PXmlNode): TXmlNodeKind {.inline.} = + ## returns `n`'s kind. + result = n.k + +proc `[]`* (n: PXmlNode, i: int): PXmlNode {.inline.} = + ## returns the `i`'th child of `n`. + assert n.k == xnElement + result = n.s[i] + +iterator items*(n: PXmlNode): PXmlNode {.inline.} = + ## iterates over any child of `n`. + assert n.k == xnElement + for i in 0 .. n.len-1: yield n[i] + +proc attr*(n: PXmlNode): PXmlAttributes {.inline.} = + ## gets the attributes belonging to `n`. + assert n.k == xnElement + result = n.fAttr + +proc `attr=`*(n: PXmlNode, attr: PXmlAttributes) {.inline.} = + ## sets the attributes belonging to `n`. + assert n.k == xnElement + n.fAttr = attr + +proc attrLen*(n: PXmlNode): int {.inline.} = + ## returns the number of `n`'s attributes. + assert n.k == xnElement + if not isNil(n.fAttr): result = len(n.fAttr) + +proc clientData*(n: PXmlNode): int {.inline.} = + ## gets the client data of `n`. 
The client data field is used by the HTML + ## parser and generator. + result = n.fClientData + +proc `clientData=`*(n: PXmlNode, data: int) {.inline.} = + ## sets the client data of `n`. The client data field is used by the HTML + ## parser and generator. + n.fClientData = data + +proc addEscaped*(result: var string, s: string) = + ## same as ``result.add(escape(s))``, but more efficient. + for c in items(s): + case c + of '<': result.add("&lt;") + of '>': result.add("&gt;") + of '&': result.add("&amp;") + of '"': result.add("&quot;") + else: result.add(c) + +proc escape*(s: string): string = + ## escapes `s` for inclusion into an XML document. + ## Escapes these characters: + ## + ## ------------ ------------------- + ## char is converted to + ## ------------ ------------------- + ## ``<`` ``&lt;`` + ## ``>`` ``&gt;`` + ## ``&`` ``&amp;`` + ## ``"`` ``&quot;`` + ## ------------ ------------------- + result = newString(s.len) + setLen(result, 0) + addEscaped(result, s) + +proc addIndent(result: var string, indent: int) = + result.add("\n") + for i in 1..indent: result.add(' ') + +proc add*(result: var string, n: PXmlNode, indent = 0, indWidth = 2) = + ## adds the textual representation of `n` to `result`. + case n.k + of xnElement: + result.add('<') + result.add(n.fTag) + if not isNil(n.fAttr): + for key, val in pairs(n.fAttr): + result.add(' ') + result.add(key) + result.add("=\"") + result.addEscaped(val) + result.add('"') + if n.len > 0: + result.add('>') + for i in 0..n.len-1: + result.addIndent(indent+indWidth) + result.add(n[i], indent+indWidth, indWidth) + result.addIndent(indent) + result.add("</") + result.add(n.fTag) + result.add(">") + else: + result.add(" />") + of xnText: + result.addEscaped(n.fText) + of xnComment: + result.add("<!-- ") + result.addEscaped(n.fText) + result.add(" -->") + of xnCDATA: + result.add("<![CDATA[") + result.add(n.fText) + result.add("]]>") + +proc `$`*(n: PXmlNode): string = + ## converts `n` into its string representation.
+ result = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n" + result.add(n) + +proc newXmlTree*(tag: string, children: openArray[PXmlNode], + attributes: PXmlAttributes = nil): PXmlNode = + ## creates a new XML tree with `tag`, `children` and `attributes` + result = newXmlNode(xnElement) + result.fTag = tag + newSeq(result.s, children.len) + for i in 0..children.len-1: result.s[i] = children[i] + result.fAttr = attributes + +proc xmlConstructor(e: PNimrodNode): PNimrodNode {.compileTime.} = + ## use this procedure to define a new XML tag + expectLen(e, 1) + var a = e[0] + if a.kind == nnkCall: + result = newCall("newXmlTree", toStrLit(a[0])) + var attrs = newCall("newStringTable", []) + var bracket = newNimNode(nnkBracket, a) + for i in 1..a.len-1: + if a[i].kind == nnkExprEqExpr: + attrs.add(toStrLit(a[i][0])) + attrs.add(a[i][1]) + else: + bracket.add(a[i]) + result.add(bracket) + if attrs.len > 1: result.add(attrs) + else: + result = newCall("newXmlTree", toStrLit(a)) + +macro `<>`*(x: expr): expr = + ## Constructor macro for XML. Example usage: + ## + ## .. code-block:: nimrod + ## <>a(href="http://force7.de/nimrod", "Nimrod rules.") + ## + ## Produces an XML tree for:: + ## + ## <a href="http://force7.de/nimrod">Nimrod rules.</a> + ## + result = xmlConstructor(x) + + + diff --git a/lib/pure/xmltreeparser.nim b/lib/pure/xmltreeparser.nim new file mode 100644 index 000000000..5a48f9e8b --- /dev/null +++ b/lib/pure/xmltreeparser.nim @@ -0,0 +1,52 @@ +# +# +# Nimrod's Runtime Library +# (c) Copyright 2010 Andreas Rumpf +# +# See the file "copying.txt", included in this +# distribution, for details about the copyright. +# + +## This module parses an XML document and creates its XML tree representation. + +import streams, parsexml, xmltree + + +proc parse*(x: var TXmlParser, father: PXmlNode) = + + +proc parseXml*(s: PStream, filename: string, + errors: var seq[string]): PXmlNode = + ## parses the XML from stream `s` and returns a ``PXmlNode``. 
Every + ## occured parsing error is added to the `errors` sequence. + var x: TXmlParser + open(x, s, filename, {reportComments}) + + result = newElement("html") + while true: + x.next() + case x.kind + of xmlWhitespace: nil # just skip it + of xmlComment: + result.add(newComment(x.text)) + + close(x) + +proc parseXml*(s: PStream): PXmlNode = + ## parses the XTML from stream `s` and returns a ``PXmlNode``. All parsing + ## errors are ignored. + var errors: seq[string] = @[] + result = parseXml(s, "unknown_html_doc", errors) + +proc loadXml*(path: string, reportErrors = false): PXmlNode = + ## Loads and parses XML from file specified by ``path``, and returns + ## a ``PXmlNode``. If `reportErrors` is true, the parsing errors are + ## ``echo``ed. + var s = newFileStream(path, fmRead) + if s == nil: raise newException(EIO, "Unable to read file: " & path) + + var errors: seq[string] = @[] + result = parseXml(s, path, errors) + if reportErrors: + for msg in items(errors): echo(msg) + diff --git a/lib/system.nim b/lib/system.nim index 79c014a7c..ae9d6b432 100755 --- a/lib/system.nim +++ b/lib/system.nim @@ -1257,7 +1257,9 @@ proc echo*[Ty](x: openarray[Ty]) {.magic: "Echo".} ## equivalent to ``writeln(stdout, x); flush(stdout)``. BUT: This is ## available for the ECMAScript target too! -template newException(exceptn, message: expr): expr = +template newException*(exceptn, message: expr): expr = + ## creates an exception object of type "exceptn" and sets its ``msg`` field + ## to `message`. Returns the new exception object. block: # open a new scope var e: ref exceptn diff --git a/web/news.txt b/web/news.txt index 096ee8ba9..b4f3551d1 100755 --- a/web/news.txt +++ b/web/news.txt @@ -26,6 +26,7 @@ Additions - Added ``system.cstringArrayToSeq``. - Added ``system.lines(f: TFile)`` iterator. - Added ``system.delete``, ``system.del`` and ``system.insert`` for sequences. +- Exported ``system.newException`` template. 
- Added ``cgi.decodeData(data: string): tuple[key, value: string]``. - Added ``ropes`` module. - Added ``sockets`` module. @@ -36,6 +37,9 @@ Additions - Added ``unidecode`` module. - Added ``xmldom`` module. - Added ``xmldomparser`` module. +- Added ``xmltree`` module. +- Added ``xmltreeparser`` module. +- Added ``htmlparser`` module. - Many wrappers now do not contain redundant name prefixes (like ``GTK_``, ``lua``). The new wrappers are available in ``lib/newwrap``. Change your configuration file to use these. @@ -100,7 +104,7 @@ Changes affecting backwards compatibility - The compiler does not skip the linking step anymore even if no file has changed. - ``os.splitFile(".xyz")`` now returns ``("", ".xyz", "")`` instead of - ``("", "", ".xyz")``. Filenames starting with a dot are handled + ``("", "", ".xyz")``. So filenames starting with a dot are handled differently. - ``strutils.split(s: string, seps: set[char])`` never yields the empty string anymore. This behaviour is probably more appropriate for whitespace splitting. |