summary refs log tree commit diff stats
path: root/lib
diff options
context:
space:
mode:
author Araq <rumpf_a@web.de> 2010-10-30 23:36:36 +0200
committer Araq <rumpf_a@web.de> 2010-10-30 23:36:36 +0200
commit 11d3d94438056dd98cf4a49411b18c61bfc0cc56 (patch)
tree 5ef639b5ba2ac63c29097e5c43a5156692367695 /lib
parent d11a62af3142751cb4bfb5ebfaf1e749d9d828ac (diff)
download Nim-11d3d94438056dd98cf4a49411b18c61bfc0cc56.tar.gz
bugfixes: htmlparser module
Diffstat (limited to 'lib')
-rwxr-xr-x lib/impure/re.nim 18
-rwxr-xr-x lib/pure/htmlparser.nim 17
-rwxr-xr-x lib/pure/parsexml.nim 2
3 files changed, 33 insertions, 4 deletions
diff --git a/lib/impure/re.nim b/lib/impure/re.nim
index a0d88d82f..a3fb86332 100755
--- a/lib/impure/re.nim
+++ b/lib/impure/re.nim
@@ -136,6 +136,24 @@ proc find*(s: string, pattern: TRegEx, start = 0): int =
   ## match, -1 is returned.
   var matches: array[0..maxSubpatterns-1, string]
   result = find(s, pattern, matches, start)
+  
+iterator findAll*(s: string, pattern: TRegEx, start = 0): string = 
+  ## Yields the captured substrings of every successive match of `pattern`
+  ## in `s`, starting the search at index `start`.
+  ##
+  ## For each match the captures are yielded in order up to the first
+  ## unset (nil) capture. The next search resumes at the match position
+  ## advanced by the summed lengths of the yielded captures.
+  ## If a match consumes no captured text (no capture was set, or every
+  ## capture is empty) the search position is moved forward by one
+  ## character, so the iterator always terminates.
+  var matches: array[0..MaxSubpatterns-1, string]
+  var i = start
+  while true: 
+    var j = find(s, pattern, matches, i)
+    if j < 0: break
+    i = j
+    for k in 0..MaxSubpatterns-1: 
+      if isNil(matches[k]): break
+      inc(i, matches[k].len)
+      yield matches[k]
+    if i == j:
+      # nothing was consumed: without this step the next ``find`` would be
+      # started at the very same offset again and the loop would never end
+      inc(i)
+      if i > s.len: break
+
+proc findAll*(s: string, pattern: TRegEx, start = 0): seq[string] = 
+  ## Collects every capture produced by the ``findAll`` iterator for
+  ## `pattern` in `s` (searching from index `start`) into a sequence.
+  ## The result is @[] when the pattern does not match.
+  result = @[]
+  for capture in findAll(s, pattern, start):
+    add(result, capture)
 
 template `=~` *(s: string, pattern: TRegEx): expr = 
   ## This calls ``match`` with an implicit declared ``matches`` array that 
diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim
index 104c53833..d84688be6 100755
--- a/lib/pure/htmlparser.nim
+++ b/lib/pure/htmlparser.nim
@@ -269,6 +269,7 @@ proc expected(x: var TXmlParser, n: PXmlNode): string =
 
 proc untilElementEnd(x: var TXmlParser, result: PXmlNode, 
                      errors: var seq[string]) =
+  # we parsed e.g. ``<br>`` and don't really expect a ``</br>``: 
   if result.htmlTag in singleTags:
     if x.kind != xmlElementEnd or cmpIgnoreCase(x.elementName, result.tag) != 0:
       return
@@ -277,10 +278,15 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode,
     of xmlElementStart, xmlElementOpen:
       case result.htmlTag
       of tagLi, tagP, tagDt, tagDd, tagInput, tagOption:
-        if htmlTag(x.elementName) notin InlineTags:
-          # some tags are common to have no ``</end>``, like ``<li>``:
+        # some tags are common to have no ``</end>``, like ``<li>``:
+        if htmlTag(x.elementName) in {tagLi, tagP, tagDt, tagDd, tagInput,
+                                      tagOption}:
           errors.add(expected(x, result))
           break
+        when false:
+          if htmlTag(x.elementName) notin InlineTags:
+            errors.add(expected(x, result))
+            break
       of tagTr, tagTd, tagTh, tagTfoot, tagThead:
         if htmlTag(x.elementName) in {tagTr, tagTd, tagTh, tagTfoot, tagThead}:
           errors.add(expected(x, result))
@@ -367,9 +373,14 @@ proc parseHtml*(s: PStream, filename: string,
   # skip the DOCTYPE:
   if x.kind == xmlSpecial: next(x)
   result = parse(x, errors)
-  while x.kind != xmlEof:
+  if x.kind != xmlEof:
     errors.add(errorMsg(x, "EOF expected"))
+  while x.kind != xmlEof:
+    var oldPos = x.bufpos # little hack to see if we made any progress
     result.addNode(parse(x, errors))
+    if x.bufpos == oldPos: 
+      # force progress!
+      next(x) 
   close(x)
 
 proc parseHtml*(s: PStream): PXmlNode = 
diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim
index 598ae6c68..8551dda90 100755
--- a/lib/pure/parsexml.nim
+++ b/lib/pure/parsexml.nim
@@ -53,7 +53,7 @@ import
 
 # the parser treats ``<br />`` as ``<br></br>``
 
-##  xmlElementCloseEnd, ## ``/>`` 
+#  xmlElementCloseEnd, ## ``/>`` 
 
 type 
   TXmlEventKind* = enum ## enumeration of all events that may occur when parsing