diff options
-rwxr-xr-x | doc/pegdocs.txt | 8 | ||||
-rwxr-xr-x | lib/impure/re.nim | 18 | ||||
-rwxr-xr-x | lib/pure/pegs.nim | 55 | ||||
-rwxr-xr-x | lib/pure/strutils.nim | 4 | ||||
-rwxr-xr-x | lib/system.nim | 8 | ||||
-rwxr-xr-x | tools/nimgrep.nim | 126 | ||||
-rwxr-xr-x | web/news.txt | 1 |
7 files changed, 174 insertions, 46 deletions
diff --git a/doc/pegdocs.txt b/doc/pegdocs.txt index 05a7fdc58..eb7f4562f 100755 --- a/doc/pegdocs.txt +++ b/doc/pegdocs.txt @@ -27,7 +27,11 @@ notation meaning ``{E}`` Capture: Apply expression `E` and store the substring that matched `E` into a *capture* that can be accessed after the matching process. -``$i`` back reference to the ``i``th capture. ``i`` counts from 1. +``$i`` Back reference to the ``i``th capture. ``i`` counts from 1. +``$`` Anchor: Matches at the end of the input. No character + is consumed. Same as ``!.``. +``^`` Anchor: Matches at the start of the input. No character + is consumed. ``&E`` And predicate: Indicate success if expression `E` matches the text ahead; otherwise indicate failure. Do not consume any text. @@ -145,7 +149,7 @@ The PEG parser implements this grammar (written in PEG syntax):: rule <- identifier \s* "<-" expr ig identNoArrow <- identifier !(\s* "<-") prefixOpr <- ig '&' / ig '!' / ig '@' / ig '{@}' / ig '@@' - literal <- ig identifier? '$' [0-9]+ + literal <- ig identifier? '$' [0-9]+ / '$' / '^' / ig identNoArrow / ig charset / ig stringlit / diff --git a/lib/impure/re.nim b/lib/impure/re.nim index bce786087..9198a5bfe 100755 --- a/lib/impure/re.nim +++ b/lib/impure/re.nim @@ -80,6 +80,24 @@ proc matchOrFind(s: string, pattern: TRegEx, matches: var openarray[string], if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1) else: matches[i-1] = "" return rawMatches[1] - rawMatches[0] + +proc findBounds*(s: string, pattern: TRegEx, matches: var openarray[string], + start = 0): tuple[first, last: int] = + ## returns the starting position and end position of ``pattern`` in ``s`` + ## and the captured + ## substrings in the array ``matches``. If it does not match, nothing + ## is written into ``matches`` and (-1,0) is returned. + var + rawMatches: array[0..maxSubpatterns * 3 - 1, cint] + res = pcre.Exec(pattern.h, nil, s, len(s), start, 0'i32, + cast[ptr cint](addr(rawMatches)), maxSubpatterns * 3) + if res < 0'i32: return (-1, 0) + for i in 1..int(res)-1: + var a = rawMatches[i * 2] + var b = rawMatches[i * 2 + 1] + if a >= 0'i32: matches[i-1] = copy(s, int(a), int(b)-1) + else: matches[i-1] = "" + return (rawMatches[0].int, rawMatches[1].int - 1) proc matchOrFind(s: string, pattern: TRegEx, start, flags: cint): cint = var rawMatches: array [0..maxSubpatterns * 3 - 1, cint] diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim index 4628a3ff9..de968bff4 100755 --- a/lib/pure/pegs.nim +++ b/lib/pure/pegs.nim @@ -1,7 +1,7 @@ # # # Nimrod's Runtime Library -# (c) Copyright 2010 Andreas Rumpf +# (c) Copyright 2011 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. @@ -65,7 +65,8 @@ type pkSearch, ## @a --> Internal DSL: @a pkCapturedSearch, ## {@} a --> Internal DSL: @@a pkRule, ## a <- b - pkList ## a, b + pkList, ## a, b + pkStartAnchor ## ^ --> Internal DSL: startAnchor() TNonTerminalFlag = enum ntDeclared, ntUsed TNonTerminal {.final.} = object ## represents a non terminal symbol @@ -264,6 +265,14 @@ proc UnicodeWhitespace*: TPeg {.inline.} = ## whitespace character. result.kind = pkWhitespace +proc startAnchor*: TPeg {.inline.} = + ## constructs the PEG ``^`` which matches the start of the input. + result.kind = pkStartAnchor + +proc endAnchor*: TPeg {.inline.} = + ## constructs the PEG ``$`` which matches the end of the input. + result = !any() + proc capture*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsCapture".} = ## constructs a capture with the PEG `a` result.kind = pkCapture @@ -484,6 +493,8 @@ proc toStrAux(r: TPeg, res: var string) = for i in 0 .. high(r.sons): toStrAux(r.sons[i], res) add(res, "\n") + of pkStartAnchor: + add(res, '^') proc `$` *(r: TPeg): string {.nosideEffect, rtl, extern: "npegsToString".} = ## converts a PEG to its string representation @@ -496,6 +507,7 @@ type TCaptures* {.final.} = object ## contains the captured substrings. matches: array[0..maxSubpatterns-1, tuple[first, last: int]] ml: int + origStart: int proc bounds*(c: TCaptures, i: range[0..maxSubpatterns-1]): tuple[first, last: int] = @@ -721,6 +733,9 @@ proc rawMatch*(s: string, p: TPeg, start: int, c: var TCaptures): int {. n.kind = succ(pkTerminal, ord(p.kind)-ord(pkBackRef)) n.term = s.copy(a, b) result = rawMatch(s, n, start, c) + of pkStartAnchor: + if c.origStart == start: result = 0 + else: result = -1 of pkRule, pkList: assert false proc match*(s: string, pattern: TPeg, matches: var openarray[string], @@ -730,6 +745,7 @@ proc match*(s: string, pattern: TPeg, matches: var openarray[string], ## match, nothing is written into ``matches`` and ``false`` is ## returned. var c: TCaptures + c.origStart = start result = rawMatch(s, pattern, start, c) == len(s) -start if result: for i in 0..c.ml-1: @@ -739,6 +755,7 @@ proc match*(s: string, pattern: TPeg, start = 0): bool {.nosideEffect, rtl, extern: "npegs$1".} = ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``. var c: TCaptures + c.origStart = start result = rawMatch(s, pattern, start, c) == len(s)-start proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string], @@ -748,6 +765,7 @@ proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string], ## of zero can happen. It's possible that a suffix of `s` remains ## that does not belong to the match. var c: TCaptures + c.origStart = start result = rawMatch(s, pattern, start, c) if result >= 0: for i in 0..c.ml-1: @@ -760,6 +778,7 @@ proc matchLen*(s: string, pattern: TPeg, ## of zero can happen. It's possible that a suffix of `s` remains ## that does not belong to the match. var c: TCaptures + c.origStart = start result = rawMatch(s, pattern, start, c) proc find*(s: string, pattern: TPeg, matches: var openarray[string], @@ -988,14 +1007,16 @@ type tkAt, ## '@' tkBuiltin, ## \identifier tkEscaped, ## \\ - tkDollar ## '$' + tkBackref, ## '$' + tkDollar, ## '$' + tkHat ## '^' TToken {.final.} = object ## a token kind: TTokKind ## the type of the token modifier: TModifier literal: string ## the parsed (string) literal charset: set[char] ## if kind == tkCharSet - index: int ## if kind == tkDollar + index: int ## if kind == tkBackref TPegLexer = object ## the lexer object. bufpos: int ## the current position within the buffer @@ -1010,7 +1031,7 @@ const "invalid", "[EOF]", ".", "_", "identifier", "string literal", "character set", "(", ")", "{", "}", "{@}", "<-", "/", "*", "+", "&", "!", "?", - "@", "built-in", "escaped", "$" + "@", "built-in", "escaped", "$", "$", "^" ] proc HandleCR(L: var TPegLexer, pos: int): int = @@ -1155,13 +1176,13 @@ proc getDollar(c: var TPegLexer, tok: var TToken) = var pos = c.bufPos + 1 var buf = c.buf if buf[pos] in {'0'..'9'}: - tok.kind = tkDollar + tok.kind = tkBackref tok.index = 0 while buf[pos] in {'0'..'9'}: tok.index = tok.index * 10 + ord(buf[pos]) - ord('0') inc(pos) else: - tok.kind = tkInvalid + tok.kind = tkDollar c.bufpos = pos proc getCharSet(c: var TPegLexer, tok: var TToken) = @@ -1280,7 +1301,8 @@ proc getTok(c: var TPegLexer, tok: var TToken) = tok.literal = "[EOF]" of 'a'..'z', 'A'..'Z', '\128'..'\255': getSymbol(c, tok) - if c.buf[c.bufpos] in {'\'', '"', '$'}: + if c.buf[c.bufpos] in {'\'', '"'} or + c.buf[c.bufpos] == '$' and c.buf[c.bufpos+1] in {'0'..'9'}: case tok.literal of "i": tok.modifier = modIgnoreCase of "y": tok.modifier = modIgnoreStyle @@ -1331,6 +1353,10 @@ proc getTok(c: var TPegLexer, tok: var TToken) = tok.kind = tkCurlyAt inc(c.bufpos) add(tok.literal, '@') + of '^': + tok.kind = tkHat + inc(c.bufpos) + add(tok.literal, '^') else: add(tok.literal, c.buf[c.bufpos]) inc(c.bufpos) @@ -1474,7 +1500,13 @@ proc primary(p: var TPegParser): TPeg = of tkEscaped: result = term(p.tok.literal[0]).token(p) getTok(p) - of tkDollar: + of tkDollar: + result = endAnchor() + getTok(p) + of tkHat: + result = startAnchor() + getTok(p) + of tkBackref: var m = p.tok.modifier if m == modNone: m = p.modifier result = modifiedBackRef(p.tok.index, m).token(p) @@ -1502,7 +1534,8 @@ proc seqExpr(p: var TPegParser): TPeg = while true: case p.tok.kind of tkAmp, tkNot, tkAt, tkStringLit, tkCharset, tkParLe, tkCurlyLe, - tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkCurlyAt: + tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref, + tkHat, tkCurlyAt: result = sequence(result, primary(p)) of tkIdentifier: if not arrowIsNextTok(p): @@ -1693,3 +1726,5 @@ when isMainModule: peg"\skip(\s*) {\ident}'='{\ident}", "$1<-$2$2") == "var1<-keykey;var2<-key2key2") + assert match("prefix/start", peg"^start$", 7) + diff --git a/lib/pure/strutils.nim b/lib/pure/strutils.nim index 76ea068df..6a7f128c5 100755 --- a/lib/pure/strutils.nim +++ b/lib/pure/strutils.nim @@ -1,7 +1,7 @@ # # # Nimrod's Runtime Library -# (c) Copyright 2010 Andreas Rumpf +# (c) Copyright 2011 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. @@ -42,6 +42,8 @@ const IdentStartChars* = {'a'..'z', 'A'..'Z', '_'} ## the set of characters an identifier can start with + NewLines* = {'\13', '\10'} + ## the set of characters a newline terminator can start with proc toLower*(c: Char): Char {.noSideEffect, procvar, rtl, extern: "nsuToLowerChar".} = diff --git a/lib/system.nim b/lib/system.nim index d632b7367..7822e6cad 100755 --- a/lib/system.nim +++ b/lib/system.nim @@ -734,16 +734,16 @@ proc compileOption*(option: string): bool {. ## can be used to determine an on|off compile-time option. Example: ## ## .. code-block:: nimrod - ## when compileOption("floatchecks"): - ## echo "compiled with floating point NaN and Inf checks" + ## when compileOption("floatchecks"): + ## echo "compiled with floating point NaN and Inf checks" proc compileOption*(option, arg: string): bool {. magic: "CompileOptionArg", noSideEffect.} ## can be used to determine an enum compile-time option. Example: ## ## .. code-block:: nimrod - ## when compileOption("opt", "size") and compileOption("gc", "boehm"): - ## echo "compiled with optimization for size and uses Boehm's GC" + ## when compileOption("opt", "size") and compileOption("gc", "boehm"): + ## echo "compiled with optimization for size and uses Boehm's GC" include "system/inclrtl" diff --git a/tools/nimgrep.nim b/tools/nimgrep.nim index cc1f89a74..8ee1b8a76 100755 --- a/tools/nimgrep.nim +++ b/tools/nimgrep.nim @@ -1,7 +1,7 @@ # # # Nimrod Grep Utility -# (c) Copyright 2010 Andreas Rumpf +# (c) Copyright 2011 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. @@ -11,21 +11,28 @@ import os, strutils, parseopt, pegs, re, terminal const - Usage = """ -Usage: nimgrep [options] [pattern] [files/directory] + Version = "0.7" + Usage = "nimgrep - Nimrod Grep Utility Version " & version & """ + + (c) 2011 Andreas Rumpf +Usage: + nimgrep [options] [pattern] [files/directory] Options: --find, -f find the pattern (default) --replace, -r replace the pattern --peg pattern is a peg (default) - --re pattern is a regular expression + --re pattern is a regular expression; extended syntax for + the regular expression is always turned on --recursive process directories recursively --confirm confirm each occurence/replacement; there is a chance - to abort any time without touching the file(s) + to abort any time without touching the file --stdin read pattern from stdin (to avoid the shell's confusing quoting rules) --word, -w the pattern should have word boundaries --ignore_case, -i be case insensitive --ignore_style, -y be style insensitive + --help, -h shows this help + --version, -v shows the version """ type @@ -48,7 +55,7 @@ proc ask(msg: string): string = proc Confirm: TConfirmEnum = while true: - case normalize(ask("[a]bort; [y]es, a[l]l, [n]o, non[e]: ")) + case normalize(ask(" [a]bort; [y]es, a[l]l, [n]o, non[e]: ")) of "a", "abort": return ceAbort of "y", "yes": return ceYes of "l", "all": return ceAll @@ -56,12 +63,7 @@ proc Confirm: TConfirmEnum = of "e", "none": return ceNone else: nil -proc highlight(a, b, c: string) = - stdout.write(a) - terminal.WriteStyled(b) - stdout.writeln(c) - -proc countLines(s: string, first = 0, last = s.high): int = +proc countLines(s: string, first, last: int): int = var i = first while i <= last: if s[i] == '\13': @@ -71,6 +73,37 @@ proc countLines(s: string, first = 0, last = s.high): int = inc result inc i +proc beforePattern(s: string, first: int): int = + result = first-1 + while result >= 0: + if s[result] in newlines: break + dec(result) + inc(result) + +proc afterPattern(s: string, last: int): int = + result = last+1 + while result < s.len: + if s[result] in newlines: break + inc(result) + dec(result) + +proc highlight(s, match, repl: string, t: tuple[first, last: int], + line: int, showRepl: bool) = + const alignment = 6 + stdout.write(line.`$`.align(alignment), ": ") + var x = beforePattern(s, t.first) + var y = afterPattern(s, t.last) + for i in x .. t.first-1: stdout.write(s[i]) + terminal.WriteStyled(match, {styleUnderscore, styleBright}) + for i in t.last+1 .. y: stdout.write(s[i]) + stdout.write("\n") + if showRepl: + stdout.write(repeatChar(alignment-1), "-> ") + for i in x .. t.first-1: stdout.write(s[i]) + terminal.WriteStyled(repl, {styleUnderscore, styleBright}) + for i in t.last+1 .. y: stdout.write(s[i]) + stdout.write("\n") + proc processFile(filename: string) = var buffer = system.readFile(filename) if isNil(buffer): quit("cannot open file: " & filename) @@ -92,53 +125,76 @@ proc processFile(filename: string) = var line = 1 var i = 0 - var matches: array[0..re.MaxSubpatterns-1. string] + var matches: array[0..re.MaxSubpatterns-1, string] + for j in 0..high(matches): matches[j] = "" var reallyReplace = true while i < buffer.len: var t: tuple[first, last: int] - if optRegex in options: - quit "to implement" - else: + if optRegex notin options: t = findBounds(buffer, pegp, matches, i) - + else: + t = findBounds(buffer, rep, matches, i) if t.first <= 0: break inc(line, countLines(buffer, i, t.first-1)) var wholeMatch = buffer.copy(t.first, t.last) - echo "line ", line, ": ", wholeMatch - if optReplace in options: - var r = replace(wholeMatch, pegp, replacement) - + if optReplace notin options: + highlight(buffer, wholeMatch, "", t, line, showRepl=false) + else: + var r: string + if optRegex notin options: + r = replace(wholeMatch, pegp, replacement % matches) + else: + r = replace(wholeMatch, rep, replacement % matches) if optConfirm in options: + highlight(buffer, wholeMatch, r, t, line, showRepl=true) case Confirm() - of ceAbort: - of ceYes: + of ceAbort: quit(0) + of ceYes: reallyReplace = true of ceAll: reallyReplace = true + options.excl(optConfirm) of ceNo: reallyReplace = false of ceNone: reallyReplace = false + options.excl(optConfirm) + else: + highlight(buffer, wholeMatch, r, t, line, showRepl=reallyReplace) if reallyReplace: - + result.add(buffer.copy(i, t.first-1)) + result.add(r) + else: + result.add(buffer.copy(i, t.last)) inc(line, countLines(buffer, t.first, t.last)) - i = t.last+1 - + if optReplace in options: + result.add(copy(buffer, i)) + var f: TFile + if open(f, filename, fmWrite): + f.write(result) + f.close() + else: + quit "cannot open file for overwriting: " & filename + proc walker(dir: string) = + var isDir = false for kind, path in walkDir(dir): + isDir = true case kind - of pcFile: processFile(path) - of pcDirectory: + of pcFile: + processFile(path) + of pcDir: if optRecursive in options: walker(path) else: nil + if not isDir: processFile(dir) proc writeHelp() = quit(Usage) -proc writeVersion() = quit("1.0") +proc writeVersion() = quit(Version) proc checkOptions(subset: TOptions, a, b: string) = if subset <= options: @@ -187,5 +243,17 @@ if pattern.len == 0: writeHelp() else: if filename.len == 0: filename = os.getCurrentDir() + if optRegex notin options: + if optIgnoreStyle in options: + pattern = "\\y " & pattern + elif optIgnoreCase in options: + pattern = "\\i " & pattern + if optWord in options: + pattern = r"(&\letter? / ^ )(" & pattern & r") !\letter" + else: + if optIgnoreStyle in options: + quit "ignorestyle not supported for regular expressions" + if optWord in options: + pattern = r"\b (:?" & pattern & r") \b" walker(filename) diff --git a/web/news.txt b/web/news.txt index 0a8f01cad..36deb92df 100755 --- a/web/news.txt +++ b/web/news.txt @@ -42,6 +42,7 @@ Additions - Pegs support new built-ins: ``\letter``, ``\upper``, ``\lower``, ``\title``, ``\white``. - Pegs support the new built-in ``\skip`` operation. +- Pegs support the ``$`` and ``^`` anchors. - Source code filters are now documented. - Added ``emit`` pragma for direct code generator control. - Additional operations were added to the ``complex`` module. |