diff options
-rwxr-xr-x | lib/impure/re.nim | 18 | ||||
-rwxr-xr-x | lib/pure/htmlparser.nim | 17 | ||||
-rwxr-xr-x | lib/pure/parsexml.nim | 2 | ||||
-rwxr-xr-x | todo.txt | 5 | ||||
-rwxr-xr-x | web/news.txt | 16 |
5 files changed, 54 insertions, 4 deletions
diff --git a/lib/impure/re.nim b/lib/impure/re.nim index a0d88d82f..a3fb86332 100755 --- a/lib/impure/re.nim +++ b/lib/impure/re.nim @@ -136,6 +136,24 @@ proc find*(s: string, pattern: TRegEx, start = 0): int = ## match, -1 is returned. var matches: array[0..maxSubpatterns-1, string] result = find(s, pattern, matches, start) + +iterator findAll*(s: string, pattern: TRegEx, start = 0): string = + ## yields all matching captures of pattern in `s`. + var matches: array[0..MaxSubpatterns-1, string] + var i = start + while true: + var j = find(s, pattern, matches, i) + if j < 0: break + i = j + for k in 0..maxSubPatterns-1: + if isNil(matches[k]): break + inc(i, matches[k].len) + yield matches[k] + +proc findAll*(s: string, pattern: TRegEx, start = 0): seq[string] = + ## returns all matching captures of pattern in `s`. + ## If it does not match, @[] is returned. + accumulateResult(findAll(s, pattern, start)) template `=~` *(s: string, pattern: TRegEx): expr = ## This calls ``match`` with an implicit declared ``matches`` array that diff --git a/lib/pure/htmlparser.nim b/lib/pure/htmlparser.nim index 104c53833..d84688be6 100755 --- a/lib/pure/htmlparser.nim +++ b/lib/pure/htmlparser.nim @@ -269,6 +269,7 @@ proc expected(x: var TXmlParser, n: PXmlNode): string = proc untilElementEnd(x: var TXmlParser, result: PXmlNode, errors: var seq[string]) = + # we parsed e.g. 
``<br>`` and don't really expect a ``</br>``: if result.htmlTag in singleTags: if x.kind != xmlElementEnd or cmpIgnoreCase(x.elementName, result.tag) != 0: return @@ -277,10 +278,15 @@ proc untilElementEnd(x: var TXmlParser, result: PXmlNode, of xmlElementStart, xmlElementOpen: case result.htmlTag of tagLi, tagP, tagDt, tagDd, tagInput, tagOption: - if htmlTag(x.elementName) notin InlineTags: - # some tags are common to have no ``</end>``, like ``<li>``: + # some tags are common to have no ``</end>``, like ``<li>``: + if htmlTag(x.elementName) in {tagLi, tagP, tagDt, tagDd, tagInput, + tagOption}: errors.add(expected(x, result)) break + when false: + if htmlTag(x.elementName) notin InlineTags: + errors.add(expected(x, result)) + break of tagTr, tagTd, tagTh, tagTfoot, tagThead: if htmlTag(x.elementName) in {tagTr, tagTd, tagTh, tagTfoot, tagThead}: errors.add(expected(x, result)) @@ -367,9 +373,14 @@ proc parseHtml*(s: PStream, filename: string, # skip the DOCTYPE: if x.kind == xmlSpecial: next(x) result = parse(x, errors) - while x.kind != xmlEof: + if x.kind != xmlEof: errors.add(errorMsg(x, "EOF expected")) + while x.kind != xmlEof: + var oldPos = x.bufpos # little hack to see if we made any progress result.addNode(parse(x, errors)) + if x.bufpos == oldPos: + # force progress! + next(x) close(x) proc parseHtml*(s: PStream): PXmlNode = diff --git a/lib/pure/parsexml.nim b/lib/pure/parsexml.nim index 598ae6c68..8551dda90 100755 --- a/lib/pure/parsexml.nim +++ b/lib/pure/parsexml.nim @@ -53,7 +53,7 @@ import # the parser treats ``<br />`` as ``<br></br>`` -## xmlElementCloseEnd, ## ``/>`` +# xmlElementCloseEnd, ## ``/>`` type TXmlEventKind* = enum ## enumeration of all events that may occur when parsing diff --git a/todo.txt b/todo.txt index 9240286be..2227a13fc 100755 --- a/todo.txt +++ b/todo.txt @@ -1,3 +1,8 @@ +- fix HTML parser bug: otherbug.html! +- pegs: words are only special for definitions! 
+- pegs: findAll + + High priority (version 0.9.0) ============================= diff --git a/web/news.txt b/web/news.txt index 365f1b35c..337ed21bb 100755 --- a/web/news.txt +++ b/web/news.txt @@ -2,6 +2,22 @@ News ==== +2010-XX-XX Version 0.8.12 released +================================== + +Bugfixes +-------- + +- Bugfix: ``httpclient`` correctly passes the path starting with ``/``. +- Bugfixes for the ``htmlparser`` module. + + +Additions +--------- + +- Added ``re.findAll``. + + 2010-10-20 Version 0.8.10 released ================================== |