diff options
author | Araq <rumpf_a@web.de> | 2011-02-02 00:05:55 +0100 |
---|---|---|
committer | Araq <rumpf_a@web.de> | 2011-02-02 00:05:55 +0100 |
commit | 437eb2c37ec9b3829301e27ea6bce7d70ad5eb02 (patch) | |
tree | 67a9e5983c4aa70b70bc4f9680c25d1c9ceb54c4 /lib/pure/pegs.nim | |
parent | 9387913b734e1a8f4c90acdfcbdc7f92a9892244 (diff) | |
download | Nim-437eb2c37ec9b3829301e27ea6bce7d70ad5eb02.tar.gz |
nimgrep: first working version
Diffstat (limited to 'lib/pure/pegs.nim')
-rwxr-xr-x | lib/pure/pegs.nim | 55 |
1 files changed, 45 insertions, 10 deletions
diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim index 4628a3ff9..de968bff4 100755 --- a/lib/pure/pegs.nim +++ b/lib/pure/pegs.nim @@ -1,7 +1,7 @@ # # # Nimrod's Runtime Library -# (c) Copyright 2010 Andreas Rumpf +# (c) Copyright 2011 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. @@ -65,7 +65,8 @@ type pkSearch, ## @a --> Internal DSL: @a pkCapturedSearch, ## {@} a --> Internal DSL: @@a pkRule, ## a <- b - pkList ## a, b + pkList, ## a, b + pkStartAnchor ## ^ --> Internal DSL: startAnchor() TNonTerminalFlag = enum ntDeclared, ntUsed TNonTerminal {.final.} = object ## represents a non terminal symbol @@ -264,6 +265,14 @@ proc UnicodeWhitespace*: TPeg {.inline.} = ## whitespace character. result.kind = pkWhitespace +proc startAnchor*: TPeg {.inline.} = + ## constructs the PEG ``^`` which matches the start of the input. + result.kind = pkStartAnchor + +proc endAnchor*: TPeg {.inline.} = + ## constructs the PEG ``$`` which matches the end of the input. + result = !any() + proc capture*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsCapture".} = ## constructs a capture with the PEG `a` result.kind = pkCapture @@ -484,6 +493,8 @@ proc toStrAux(r: TPeg, res: var string) = for i in 0 .. high(r.sons): toStrAux(r.sons[i], res) add(res, "\n") + of pkStartAnchor: + add(res, '^') proc `$` *(r: TPeg): string {.nosideEffect, rtl, extern: "npegsToString".} = ## converts a PEG to its string representation @@ -496,6 +507,7 @@ type TCaptures* {.final.} = object ## contains the captured substrings. matches: array[0..maxSubpatterns-1, tuple[first, last: int]] ml: int + origStart: int proc bounds*(c: TCaptures, i: range[0..maxSubpatterns-1]): tuple[first, last: int] = @@ -721,6 +733,9 @@ proc rawMatch*(s: string, p: TPeg, start: int, c: var TCaptures): int {. n.kind = succ(pkTerminal, ord(p.kind)-ord(pkBackRef)) n.term = s.copy(a, b) result = rawMatch(s, n, start, c) + of pkStartAnchor: + if c.origStart == start: result = 0 + else: result = -1 of pkRule, pkList: assert false proc match*(s: string, pattern: TPeg, matches: var openarray[string], @@ -730,6 +745,7 @@ proc match*(s: string, pattern: TPeg, matches: var openarray[string], ## match, nothing is written into ``matches`` and ``false`` is ## returned. var c: TCaptures + c.origStart = start result = rawMatch(s, pattern, start, c) == len(s) -start if result: for i in 0..c.ml-1: @@ -739,6 +755,7 @@ proc match*(s: string, pattern: TPeg, start = 0): bool {.nosideEffect, rtl, extern: "npegs$1".} = ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``. var c: TCaptures + c.origStart = start result = rawMatch(s, pattern, start, c) == len(s)-start proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string], @@ -748,6 +765,7 @@ proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string], ## of zero can happen. It's possible that a suffix of `s` remains ## that does not belong to the match. var c: TCaptures + c.origStart = start result = rawMatch(s, pattern, start, c) if result >= 0: for i in 0..c.ml-1: @@ -760,6 +778,7 @@ proc matchLen*(s: string, pattern: TPeg, ## of zero can happen. It's possible that a suffix of `s` remains ## that does not belong to the match. var c: TCaptures + c.origStart = start result = rawMatch(s, pattern, start, c) proc find*(s: string, pattern: TPeg, matches: var openarray[string], @@ -988,14 +1007,16 @@ type tkAt, ## '@' tkBuiltin, ## \identifier tkEscaped, ## \\ - tkDollar ## '$' + tkBackref, ## '$' + tkDollar, ## '$' + tkHat ## '^' TToken {.final.} = object ## a token kind: TTokKind ## the type of the token modifier: TModifier literal: string ## the parsed (string) literal charset: set[char] ## if kind == tkCharSet - index: int ## if kind == tkDollar + index: int ## if kind == tkBackref TPegLexer = object ## the lexer object. bufpos: int ## the current position within the buffer @@ -1010,7 +1031,7 @@ const "invalid", "[EOF]", ".", "_", "identifier", "string literal", "character set", "(", ")", "{", "}", "{@}", "<-", "/", "*", "+", "&", "!", "?", - "@", "built-in", "escaped", "$" + "@", "built-in", "escaped", "$", "$", "^" ] proc HandleCR(L: var TPegLexer, pos: int): int = @@ -1155,13 +1176,13 @@ proc getDollar(c: var TPegLexer, tok: var TToken) = var pos = c.bufPos + 1 var buf = c.buf if buf[pos] in {'0'..'9'}: - tok.kind = tkDollar + tok.kind = tkBackref tok.index = 0 while buf[pos] in {'0'..'9'}: tok.index = tok.index * 10 + ord(buf[pos]) - ord('0') inc(pos) else: - tok.kind = tkInvalid + tok.kind = tkDollar c.bufpos = pos proc getCharSet(c: var TPegLexer, tok: var TToken) = @@ -1280,7 +1301,8 @@ proc getTok(c: var TPegLexer, tok: var TToken) = tok.literal = "[EOF]" of 'a'..'z', 'A'..'Z', '\128'..'\255': getSymbol(c, tok) - if c.buf[c.bufpos] in {'\'', '"', '$'}: + if c.buf[c.bufpos] in {'\'', '"'} or + c.buf[c.bufpos] == '$' and c.buf[c.bufpos+1] in {'0'..'9'}: case tok.literal of "i": tok.modifier = modIgnoreCase of "y": tok.modifier = modIgnoreStyle @@ -1331,6 +1353,10 @@ proc getTok(c: var TPegLexer, tok: var TToken) = tok.kind = tkCurlyAt inc(c.bufpos) add(tok.literal, '@') + of '^': + tok.kind = tkHat + inc(c.bufpos) + add(tok.literal, '^') else: add(tok.literal, c.buf[c.bufpos]) inc(c.bufpos) @@ -1474,7 +1500,13 @@ proc primary(p: var TPegParser): TPeg = of tkEscaped: result = term(p.tok.literal[0]).token(p) getTok(p) - of tkDollar: + of tkDollar: + result = endAnchor() + getTok(p) + of tkHat: + result = startAnchor() + getTok(p) + of tkBackref: var m = p.tok.modifier if m == modNone: m = p.modifier result = modifiedBackRef(p.tok.index, m).token(p) @@ -1502,7 +1534,8 @@ proc seqExpr(p: var TPegParser): TPeg = while true: case p.tok.kind of tkAmp, tkNot, tkAt, tkStringLit, tkCharset, tkParLe, tkCurlyLe, - tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkCurlyAt: + tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref, + tkHat, tkCurlyAt: result = sequence(result, primary(p)) of tkIdentifier: if not arrowIsNextTok(p): @@ -1693,3 +1726,5 @@ when isMainModule: peg"\skip(\s*) {\ident}'='{\ident}", "$1<-$2$2") == "var1<-keykey;var2<-key2key2") + assert match("prefix/start", peg"^start$", 7) + |