diff options
Diffstat (limited to 'lib/pure/pegs.nim')
-rw-r--r-- | lib/pure/pegs.nim | 720 |
1 files changed, 304 insertions, 416 deletions
diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim index 8abd3deb5..2969fd6d7 100644 --- a/lib/pure/pegs.nim +++ b/lib/pure/pegs.nim @@ -16,14 +16,17 @@ ## include "system/inclrtl" +when defined(nimPreviewSlimSystem): + import std/[syncio, assertions] const useUnicode = true ## change this to deactivate proper UTF-8 support -import strutils, macros +import std/[strutils, macros] +import std/private/decode_helpers when useUnicode: - import unicode + import std/unicode export unicode.`==` const @@ -82,30 +85,30 @@ type of pkChar, pkGreedyRepChar: ch: char of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char] of pkNonTerminal: nt: NonTerminal - of pkBackRef..pkBackRefIgnoreStyle: index: range[0..MaxSubpatterns] + of pkBackRef..pkBackRefIgnoreStyle: index: range[-MaxSubpatterns..MaxSubpatterns-1] else: sons: seq[Peg] NonTerminal* = ref NonTerminalObj -proc kind*(p: Peg): PegKind = p.kind +func kind*(p: Peg): PegKind = p.kind ## Returns the *PegKind* of a given *Peg* object. -proc term*(p: Peg): string = p.term +func term*(p: Peg): string = p.term ## Returns the *string* representation of a given *Peg* variant object ## where present. -proc ch*(p: Peg): char = p.ch +func ch*(p: Peg): char = p.ch ## Returns the *char* representation of a given *Peg* variant object ## where present. -proc charChoice*(p: Peg): ref set[char] = p.charChoice +func charChoice*(p: Peg): ref set[char] = p.charChoice ## Returns the *charChoice* field of a given *Peg* variant object ## where present. -proc nt*(p: Peg): NonTerminal = p.nt +func nt*(p: Peg): NonTerminal = p.nt ## Returns the *NonTerminal* object of a given *Peg* variant object ## where present. -proc index*(p: Peg): range[0..MaxSubpatterns] = p.index +func index*(p: Peg): range[-MaxSubpatterns..MaxSubpatterns-1] = p.index ## Returns the back-reference index of a captured sub-pattern in the ## *Captures* object for a given *Peg* variant object where present. @@ -119,59 +122,60 @@ iterator pairs*(p: Peg): (int, Peg) {.inline.} = for i in 0 ..< p.sons.len: yield (i, p.sons[i]) -proc name*(nt: NonTerminal): string = nt.name +func name*(nt: NonTerminal): string = nt.name ## Gets the name of the symbol represented by the parent *Peg* object variant ## of a given *NonTerminal*. -proc line*(nt: NonTerminal): int = nt.line +func line*(nt: NonTerminal): int = nt.line ## Gets the line number of the definition of the parent *Peg* object variant ## of a given *NonTerminal*. -proc col*(nt: NonTerminal): int = nt.col +func col*(nt: NonTerminal): int = nt.col ## Gets the column number of the definition of the parent *Peg* object variant ## of a given *NonTerminal*. -proc flags*(nt: NonTerminal): set[NonTerminalFlag] = nt.flags +func flags*(nt: NonTerminal): set[NonTerminalFlag] = nt.flags ## Gets the *NonTerminalFlag*-typed flags field of the parent *Peg* variant ## object of a given *NonTerminal*. -proc rule*(nt: NonTerminal): Peg = nt.rule +func rule*(nt: NonTerminal): Peg = nt.rule ## Gets the *Peg* object representing the rule definition of the parent *Peg* ## object variant of a given *NonTerminal*. -proc term*(t: string): Peg {.noSideEffect, rtl, extern: "npegs$1Str".} = +func term*(t: string): Peg {.rtl, extern: "npegs$1Str".} = ## constructs a PEG from a terminal string if t.len != 1: result = Peg(kind: pkTerminal, term: t) else: result = Peg(kind: pkChar, ch: t[0]) -proc termIgnoreCase*(t: string): Peg {. - noSideEffect, rtl, extern: "npegs$1".} = +func termIgnoreCase*(t: string): Peg {. + rtl, extern: "npegs$1".} = ## constructs a PEG from a terminal string; ignore case for matching result = Peg(kind: pkTerminalIgnoreCase, term: t) -proc termIgnoreStyle*(t: string): Peg {. - noSideEffect, rtl, extern: "npegs$1".} = +func termIgnoreStyle*(t: string): Peg {. + rtl, extern: "npegs$1".} = ## constructs a PEG from a terminal string; ignore style for matching result = Peg(kind: pkTerminalIgnoreStyle, term: t) -proc term*(t: char): Peg {.noSideEffect, rtl, extern: "npegs$1Char".} = +func term*(t: char): Peg {.rtl, extern: "npegs$1Char".} = ## constructs a PEG from a terminal char assert t != '\0' result = Peg(kind: pkChar, ch: t) -proc charSet*(s: set[char]): Peg {.noSideEffect, rtl, extern: "npegs$1".} = +func charSet*(s: set[char]): Peg {.rtl, extern: "npegs$1".} = ## constructs a PEG from a character set `s` assert '\0' notin s result = Peg(kind: pkCharChoice) - new(result.charChoice) - result.charChoice[] = s + {.cast(noSideEffect).}: + new(result.charChoice) + result.charChoice[] = s -proc len(a: Peg): int {.inline.} = return a.sons.len -proc add(d: var Peg, s: Peg) {.inline.} = add(d.sons, s) +func len(a: Peg): int {.inline.} = return a.sons.len +func add(d: var Peg, s: Peg) {.inline.} = add(d.sons, s) -proc addChoice(dest: var Peg, elem: Peg) = +func addChoice(dest: var Peg, elem: Peg) = var L = dest.len-1 if L >= 0 and dest.sons[L].kind == pkCharChoice: # caution! Do not introduce false aliasing here! @@ -194,12 +198,12 @@ template multipleOp(k: PegKind, localOpt: untyped) = if result.len == 1: result = result.sons[0] -proc `/`*(a: varargs[Peg]): Peg {. - noSideEffect, rtl, extern: "npegsOrderedChoice".} = +func `/`*(a: varargs[Peg]): Peg {. + rtl, extern: "npegsOrderedChoice".} = ## constructs an ordered choice with the PEGs in `a` multipleOp(pkOrderedChoice, addChoice) -proc addSequence(dest: var Peg, elem: Peg) = +func addSequence(dest: var Peg, elem: Peg) = var L = dest.len-1 if L >= 0 and dest.sons[L].kind == pkTerminal: # caution! Do not introduce false aliasing here! @@ -211,12 +215,12 @@ proc addSequence(dest: var Peg, elem: Peg) = else: add(dest, elem) else: add(dest, elem) -proc sequence*(a: varargs[Peg]): Peg {. - noSideEffect, rtl, extern: "npegs$1".} = +func sequence*(a: varargs[Peg]): Peg {. + rtl, extern: "npegs$1".} = ## constructs a sequence with all the PEGs from `a` multipleOp(pkSequence, addSequence) -proc `?`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsOptional".} = +func `?`*(a: Peg): Peg {.rtl, extern: "npegsOptional".} = ## constructs an optional for the PEG `a` if a.kind in {pkOption, pkGreedyRep, pkGreedyAny, pkGreedyRepChar, pkGreedyRepSet}: @@ -226,7 +230,7 @@ proc `?`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsOptional".} = else: result = Peg(kind: pkOption, sons: @[a]) -proc `*`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsGreedyRep".} = +func `*`*(a: Peg): Peg {.rtl, extern: "npegsGreedyRep".} = ## constructs a "greedy repetition" for the PEG `a` case a.kind of pkGreedyRep, pkGreedyRepChar, pkGreedyRepSet, pkGreedyAny, pkOption: @@ -241,96 +245,99 @@ proc `*`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsGreedyRep".} = else: result = Peg(kind: pkGreedyRep, sons: @[a]) -proc `!*`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsSearch".} = +func `!*`*(a: Peg): Peg {.rtl, extern: "npegsSearch".} = ## constructs a "search" for the PEG `a` result = Peg(kind: pkSearch, sons: @[a]) -proc `!*\`*(a: Peg): Peg {.noSideEffect, rtl, +func `!*\`*(a: Peg): Peg {.rtl, extern: "npgegsCapturedSearch".} = ## constructs a "captured search" for the PEG `a` result = Peg(kind: pkCapturedSearch, sons: @[a]) -proc `+`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsGreedyPosRep".} = +func `+`*(a: Peg): Peg {.rtl, extern: "npegsGreedyPosRep".} = ## constructs a "greedy positive repetition" with the PEG `a` return sequence(a, *a) -proc `&`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsAndPredicate".} = +func `&`*(a: Peg): Peg {.rtl, extern: "npegsAndPredicate".} = ## constructs an "and predicate" with the PEG `a` result = Peg(kind: pkAndPredicate, sons: @[a]) -proc `!`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsNotPredicate".} = +func `!`*(a: Peg): Peg {.rtl, extern: "npegsNotPredicate".} = ## constructs a "not predicate" with the PEG `a` result = Peg(kind: pkNotPredicate, sons: @[a]) -proc any*: Peg {.inline.} = +func any*: Peg {.inline.} = ## constructs the PEG `any character`:idx: (``.``) result = Peg(kind: pkAny) -proc anyRune*: Peg {.inline.} = +func anyRune*: Peg {.inline.} = ## constructs the PEG `any rune`:idx: (``_``) result = Peg(kind: pkAnyRune) -proc newLine*: Peg {.inline.} = +func newLine*: Peg {.inline.} = ## constructs the PEG `newline`:idx: (``\n``) result = Peg(kind: pkNewLine) -proc unicodeLetter*: Peg {.inline.} = +func unicodeLetter*: Peg {.inline.} = ## constructs the PEG ``\letter`` which matches any Unicode letter. result = Peg(kind: pkLetter) -proc unicodeLower*: Peg {.inline.} = +func unicodeLower*: Peg {.inline.} = ## constructs the PEG ``\lower`` which matches any Unicode lowercase letter. result = Peg(kind: pkLower) -proc unicodeUpper*: Peg {.inline.} = +func unicodeUpper*: Peg {.inline.} = ## constructs the PEG ``\upper`` which matches any Unicode uppercase letter. result = Peg(kind: pkUpper) -proc unicodeTitle*: Peg {.inline.} = +func unicodeTitle*: Peg {.inline.} = ## constructs the PEG ``\title`` which matches any Unicode title letter. result = Peg(kind: pkTitle) -proc unicodeWhitespace*: Peg {.inline.} = +func unicodeWhitespace*: Peg {.inline.} = ## constructs the PEG ``\white`` which matches any Unicode ## whitespace character. result = Peg(kind: pkWhitespace) -proc startAnchor*: Peg {.inline.} = +func startAnchor*: Peg {.inline.} = ## constructs the PEG ``^`` which matches the start of the input. result = Peg(kind: pkStartAnchor) -proc endAnchor*: Peg {.inline.} = +func endAnchor*: Peg {.inline.} = ## constructs the PEG ``$`` which matches the end of the input. result = !any() -proc capture*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsCapture".} = +func capture*(a: Peg = Peg(kind: pkEmpty)): Peg {.rtl, extern: "npegsCapture".} = ## constructs a capture with the PEG `a` result = Peg(kind: pkCapture, sons: @[a]) -proc backref*(index: range[1..MaxSubpatterns]): Peg {. - noSideEffect, rtl, extern: "npegs$1".} = +func backref*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {. + rtl, extern: "npegs$1".} = ## constructs a back reference of the given `index`. `index` starts counting - ## from 1. - result = Peg(kind: pkBackRef, index: index-1) + ## from 1. `reverse` specifies whether indexing starts from the end of the + ## capture list. + result = Peg(kind: pkBackRef, index: (if reverse: -index else: index - 1)) -proc backrefIgnoreCase*(index: range[1..MaxSubpatterns]): Peg {. - noSideEffect, rtl, extern: "npegs$1".} = +func backrefIgnoreCase*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {. + rtl, extern: "npegs$1".} = ## constructs a back reference of the given `index`. `index` starts counting - ## from 1. Ignores case for matching. - result = Peg(kind: pkBackRefIgnoreCase, index: index-1) + ## from 1. `reverse` specifies whether indexing starts from the end of the + ## capture list. Ignores case for matching. + result = Peg(kind: pkBackRefIgnoreCase, index: (if reverse: -index else: index - 1)) -proc backrefIgnoreStyle*(index: range[1..MaxSubpatterns]): Peg {. - noSideEffect, rtl, extern: "npegs$1".} = +func backrefIgnoreStyle*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {. + rtl, extern: "npegs$1".} = ## constructs a back reference of the given `index`. `index` starts counting - ## from 1. Ignores style for matching. - result = Peg(kind: pkBackRefIgnoreStyle, index: index-1) + ## from 1. `reverse` specifies whether indexing starts from the end of the + ## capture list. Ignores style for matching. + result = Peg(kind: pkBackRefIgnoreStyle, index: (if reverse: -index else: index - 1)) -proc spaceCost(n: Peg): int = +func spaceCost(n: Peg): int = case n.kind of pkEmpty: discard of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar, pkGreedyRepChar, pkCharChoice, pkGreedyRepSet, - pkAny..pkWhitespace, pkGreedyAny: + pkAny..pkWhitespace, pkGreedyAny, pkBackRef..pkBackRefIgnoreStyle: result = 1 of pkNonTerminal: # we cannot inline a rule with a non-terminal @@ -340,8 +347,8 @@ proc spaceCost(n: Peg): int = inc(result, spaceCost(n.sons[i])) if result >= InlineThreshold: break -proc nonterminal*(n: NonTerminal): Peg {. - noSideEffect, rtl, extern: "npegs$1".} = +func nonterminal*(n: NonTerminal): Peg {. + rtl, extern: "npegs$1".} = ## constructs a PEG that consists of the nonterminal symbol assert n != nil if ntDeclared in n.flags and spaceCost(n.rule) < InlineThreshold: @@ -350,8 +357,8 @@ proc nonterminal*(n: NonTerminal): Peg {. else: result = Peg(kind: pkNonTerminal, nt: n) -proc newNonTerminal*(name: string, line, column: int): NonTerminal {. - noSideEffect, rtl, extern: "npegs$1".} = +func newNonTerminal*(name: string, line, column: int): NonTerminal {. + rtl, extern: "npegs$1".} = ## constructs a nonterminal symbol result = NonTerminal(name: name, line: line, col: column) @@ -386,7 +393,7 @@ template natural*: Peg = # ------------------------- debugging ----------------------------------------- -proc esc(c: char, reserved = {'\0'..'\255'}): string = +func esc(c: char, reserved = {'\0'..'\255'}): string = case c of '\b': result = "\\b" of '\t': result = "\\t" @@ -402,14 +409,14 @@ proc esc(c: char, reserved = {'\0'..'\255'}): string = elif c in reserved: result = '\\' & c else: result = $c -proc singleQuoteEsc(c: char): string = return "'" & esc(c, {'\''}) & "'" +func singleQuoteEsc(c: char): string = return "'" & esc(c, {'\''}) & "'" -proc singleQuoteEsc(str: string): string = +func singleQuoteEsc(str: string): string = result = "'" for c in items(str): add result, esc(c, {'\''}) add result, '\'' -proc charSetEscAux(cc: set[char]): string = +func charSetEscAux(cc: set[char]): string = const reserved = {'^', '-', ']'} result = "" var c1 = 0 @@ -426,13 +433,13 @@ proc charSetEscAux(cc: set[char]): string = c1 = c2 inc(c1) -proc charSetEsc(cc: set[char]): string = +func charSetEsc(cc: set[char]): string = if card(cc) >= 128+64: result = "[^" & charSetEscAux({'\1'..'\xFF'} - cc) & ']' else: result = '[' & charSetEscAux(cc) & ']' -proc toStrAux(r: Peg, res: var string) = +func toStrAux(r: Peg, res: var string) = case r.kind of pkEmpty: add(res, "()") of pkAny: add(res, '.') @@ -518,7 +525,7 @@ proc toStrAux(r: Peg, res: var string) = of pkStartAnchor: add(res, '^') -proc `$` *(r: Peg): string {.noSideEffect, rtl, extern: "npegsToString".} = +func `$` *(r: Peg): string {.rtl, extern: "npegsToString".} = ## converts a PEG to its string representation result = "" toStrAux(r, result) @@ -531,7 +538,7 @@ type ml: int origStart: int -proc bounds*(c: Captures, +func bounds*(c: Captures, i: range[0..MaxSubpatterns-1]): tuple[first, last: int] = ## returns the bounds ``[first..last]`` of the `i`'th capture. result = c.matches[i] @@ -544,24 +551,26 @@ when not useUnicode: inc(i) template runeLenAt(s, i): untyped = 1 - proc isAlpha(a: char): bool {.inline.} = return a in {'a'..'z', 'A'..'Z'} - proc isUpper(a: char): bool {.inline.} = return a in {'A'..'Z'} - proc isLower(a: char): bool {.inline.} = return a in {'a'..'z'} - proc isTitle(a: char): bool {.inline.} = return false - proc isWhiteSpace(a: char): bool {.inline.} = return a in {' ', '\9'..'\13'} + func isAlpha(a: char): bool {.inline.} = return a in {'a'..'z', 'A'..'Z'} + func isUpper(a: char): bool {.inline.} = return a in {'A'..'Z'} + func isLower(a: char): bool {.inline.} = return a in {'a'..'z'} + func isTitle(a: char): bool {.inline.} = return false + func isWhiteSpace(a: char): bool {.inline.} = return a in {' ', '\9'..'\13'} template matchOrParse(mopProc: untyped) = # Used to make the main matcher proc *rawMatch* as well as event parser # procs. For the former, *enter* and *leave* event handler code generators # are provided which just return *discard*. - proc mopProc(s: string, p: Peg, start: int, c: var Captures): int = + proc mopProc(s: string, p: Peg, start: int, c: var Captures): int {.gcsafe, raises: [].} = proc matchBackRef(s: string, p: Peg, start: int, c: var Captures): int = # Parse handler code must run in an *of* clause of its own for each # *PegKind*, so we encapsulate the identical clause body for # *pkBackRef..pkBackRefIgnoreStyle* here. - if p.index >= c.ml: return -1 - var (a, b) = c.matches[p.index] + var index = p.index + if index < 0: index.inc(c.ml) + if index < 0 or index >= c.ml: return -1 + var (a, b) = c.matches[index] var n: Peg case p.kind of pkBackRef: @@ -821,15 +830,22 @@ template matchOrParse(mopProc: untyped) = leave(pkNotPredicate, s, p, start, result) of pkCapture: enter(pkCapture, s, p, start) - var idx = c.ml # reserve a slot for the subpattern - inc(c.ml) - result = mopProc(s, p.sons[0], start, c) - if result >= 0: - if idx < MaxSubpatterns: - c.matches[idx] = (start, start+result-1) - #else: silently ignore the capture + if p.sons.len == 0 or p.sons[0].kind == pkEmpty: + # empty capture removes last match + dec(c.ml) + c.matches[c.ml] = (0, 0) + result = 0 # match of length 0 else: - c.ml = idx + var idx = c.ml # reserve a slot for the subpattern + result = mopProc(s, p.sons[0], start, c) + if result >= 0: + if idx < MaxSubpatterns: + if idx != c.ml: + for i in countdown(c.ml, idx): + c.matches[i+1] = c.matches[i] + c.matches[idx] = (start, start+result-1) + #else: silently ignore the capture + inc(c.ml) leave(pkCapture, s, p, start, result) of pkBackRef: enter(pkBackRef, s, p, start) @@ -850,8 +866,8 @@ template matchOrParse(mopProc: untyped) = leave(pkStartAnchor, s, p, start, result) of pkRule, pkList: assert false -proc rawMatch*(s: string, p: Peg, start: int, c: var Captures): int - {.noSideEffect, rtl, extern: "npegs$1".} = +func rawMatch*(s: string, p: Peg, start: int, c: var Captures): int + {.rtl, extern: "npegs$1".} = ## low-level matching proc that implements the PEG interpreter. Use this ## for maximum efficiency (every other PEG operation ends up calling this ## proc). @@ -863,13 +879,17 @@ proc rawMatch*(s: string, p: Peg, start: int, c: var Captures): int template leave(pk, s, p, start, length) = discard matchOrParse(matchIt) - result = matchIt(s, p, start, c) + {.cast(noSideEffect).}: + # This cast is allowed because the `matchOrParse` template is used for + # both matching and parsing, but side effects are only possible when it's + # used by `eventParser`. + result = matchIt(s, p, start, c) macro mkHandlerTplts(handlers: untyped): untyped = # Transforms the handler spec in *handlers* into handler templates. # The AST structure of *handlers[0]*: # - # .. code-block:: + # ``` # StmtList # Call # Ident "pkNonTerminal" @@ -890,7 +910,8 @@ macro mkHandlerTplts(handlers: untyped): untyped = # StmtList # <handler code block> # ... - proc mkEnter(hdName, body: NimNode): NimNode = + # ``` + func mkEnter(hdName, body: NimNode): NimNode = template helper(hdName, body) {.dirty.} = template hdName(s, p, start) = let s {.inject.} = s @@ -939,60 +960,61 @@ template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) = ## match, else the length of the total match. The following example code ## evaluates an arithmetic expression defined by a simple PEG: ## - ## .. code-block:: nim - ## import strutils, pegs + ## ```nim + ## import std/[strutils, pegs] ## - ## let - ## pegAst = """ - ## Expr <- Sum - ## Sum <- Product (('+' / '-')Product)* - ## Product <- Value (('*' / '/')Value)* - ## Value <- [0-9]+ / '(' Expr ')' - ## """.peg - ## txt = "(5+3)/2-7*22" + ## let + ## pegAst = """ + ## Expr <- Sum + ## Sum <- Product (('+' / '-')Product)* + ## Product <- Value (('*' / '/')Value)* + ## Value <- [0-9]+ / '(' Expr ')' + ## """.peg + ## txt = "(5+3)/2-7*22" ## - ## var - ## pStack: seq[string] = @[] - ## valStack: seq[float] = @[] - ## opStack = "" - ## let - ## parseArithExpr = pegAst.eventParser: - ## pkNonTerminal: - ## enter: - ## pStack.add p.nt.name - ## leave: - ## pStack.setLen pStack.high - ## if length > 0: - ## let matchStr = s.substr(start, start+length-1) - ## case p.nt.name - ## of "Value": - ## try: - ## valStack.add matchStr.parseFloat - ## echo valStack - ## except ValueError: - ## discard - ## of "Sum", "Product": - ## try: - ## let val = matchStr.parseFloat - ## except ValueError: - ## if valStack.len > 1 and opStack.len > 0: - ## valStack[^2] = case opStack[^1] - ## of '+': valStack[^2] + valStack[^1] - ## of '-': valStack[^2] - valStack[^1] - ## of '*': valStack[^2] * valStack[^1] - ## else: valStack[^2] / valStack[^1] - ## valStack.setLen valStack.high - ## echo valStack - ## opStack.setLen opStack.high - ## echo opStack - ## pkChar: - ## leave: - ## if length == 1 and "Value" != pStack[^1]: - ## let matchChar = s[start] - ## opStack.add matchChar - ## echo opStack + ## var + ## pStack: seq[string] = @[] + ## valStack: seq[float] = @[] + ## opStack = "" + ## let + ## parseArithExpr = pegAst.eventParser: + ## pkNonTerminal: + ## enter: + ## pStack.add p.nt.name + ## leave: + ## pStack.setLen pStack.high + ## if length > 0: + ## let matchStr = s.substr(start, start+length-1) + ## case p.nt.name + ## of "Value": + ## try: + ## valStack.add matchStr.parseFloat + ## echo valStack + ## except ValueError: + ## discard + ## of "Sum", "Product": + ## try: + ## let val = matchStr.parseFloat + ## except ValueError: + ## if valStack.len > 1 and opStack.len > 0: + ## valStack[^2] = case opStack[^1] + ## of '+': valStack[^2] + valStack[^1] + ## of '-': valStack[^2] - valStack[^1] + ## of '*': valStack[^2] * valStack[^1] + ## else: valStack[^2] / valStack[^1] + ## valStack.setLen valStack.high + ## echo valStack + ## opStack.setLen opStack.high + ## echo opStack + ## pkChar: + ## leave: + ## if length == 1 and "Value" != pStack[^1]: + ## let matchChar = s[start] + ## opStack.add matchChar + ## echo opStack ## - ## let pLen = parseArithExpr(txt) + ## let pLen = parseArithExpr(txt) + ## ``` ## ## The *handlers* parameter consists of code blocks for *PegKinds*, ## which define the grammar elements of interest. Each block can contain @@ -1007,7 +1029,7 @@ template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) = ## Symbols declared in an *enter* handler can be made visible in the ## corresponding *leave* handler by annotating them with an *inject* pragma. proc rawParse(s: string, p: Peg, start: int, c: var Captures): int - {.genSym.} = + {.gensym.} = # binding from *macros* bind strVal @@ -1022,10 +1044,10 @@ template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) = # by *mkHandlerTplts*. template mkDoEnter(hdPostf, s, pegNode, start) = when declared(`enter hdPostf`): - `enter hdPostf`(s, pegNode, start): + `enter hdPostf`(s, pegNode, start) else: discard - let hdPostf = ident(substr(strVal(pegKind), 2)) + let hdPostf = ident(substr($pegKind, 2)) getAst(mkDoEnter(hdPostf, s, pegNode, start)) macro leave(pegKind, s, pegNode, start, length: untyped): untyped = @@ -1033,16 +1055,16 @@ template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) = # a grammar element of kind *pegKind*. template mkDoLeave(hdPostf, s, pegNode, start, length) = when declared(`leave hdPostf`): - `leave hdPostf`(s, pegNode, start, length): + `leave hdPostf`(s, pegNode, start, length) else: discard - let hdPostf = ident(substr(strVal(pegKind), 2)) + let hdPostf = ident(substr($pegKind, 2)) getAst(mkDoLeave(hdPostf, s, pegNode, start, length)) matchOrParse(parseIt) parseIt(s, p, start, c) - proc parser(s: string): int {.genSym.} = + proc parser(s: string): int {.gensym.} = # the proc to be returned var ms: array[MaxSubpatterns, (int, int)] @@ -1059,8 +1081,8 @@ template fillMatches(s, caps, c) = else: caps[k] = "" -proc matchLen*(s: string, pattern: Peg, matches: var openArray[string], - start = 0): int {.noSideEffect, rtl, extern: "npegs$1Capture".} = +func matchLen*(s: string, pattern: Peg, matches: var openArray[string], + start = 0): int {.rtl, extern: "npegs$1Capture".} = ## the same as ``match``, but it returns the length of the match, ## if there is no match, -1 is returned. Note that a match length ## of zero can happen. It's possible that a suffix of `s` remains @@ -1070,8 +1092,8 @@ proc matchLen*(s: string, pattern: Peg, matches: var openArray[string], result = rawMatch(s, pattern, start, c) if result >= 0: fillMatches(s, matches, c) -proc matchLen*(s: string, pattern: Peg, - start = 0): int {.noSideEffect, rtl, extern: "npegs$1".} = +func matchLen*(s: string, pattern: Peg, + start = 0): int {.rtl, extern: "npegs$1".} = ## the same as ``match``, but it returns the length of the match, ## if there is no match, -1 is returned. Note that a match length ## of zero can happen. It's possible that a suffix of `s` remains @@ -1080,22 +1102,22 @@ proc matchLen*(s: string, pattern: Peg, c.origStart = start result = rawMatch(s, pattern, start, c) -proc match*(s: string, pattern: Peg, matches: var openArray[string], - start = 0): bool {.noSideEffect, rtl, extern: "npegs$1Capture".} = +func match*(s: string, pattern: Peg, matches: var openArray[string], + start = 0): bool {.rtl, extern: "npegs$1Capture".} = ## returns ``true`` if ``s[start..]`` matches the ``pattern`` and ## the captured substrings in the array ``matches``. If it does not ## match, nothing is written into ``matches`` and ``false`` is ## returned. result = matchLen(s, pattern, matches, start) != -1 -proc match*(s: string, pattern: Peg, - start = 0): bool {.noSideEffect, rtl, extern: "npegs$1".} = +func match*(s: string, pattern: Peg, + start = 0): bool {.rtl, extern: "npegs$1".} = ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``. result = matchLen(s, pattern, start) != -1 -proc find*(s: string, pattern: Peg, matches: var openArray[string], - start = 0): int {.noSideEffect, rtl, extern: "npegs$1Capture".} = +func find*(s: string, pattern: Peg, matches: var openArray[string], + start = 0): int {.rtl, extern: "npegs$1Capture".} = ## returns the starting position of ``pattern`` in ``s`` and the captured ## substrings in the array ``matches``. If it does not match, nothing ## is written into ``matches`` and -1 is returned. @@ -1109,9 +1131,9 @@ proc find*(s: string, pattern: Peg, matches: var openArray[string], return -1 # could also use the pattern here: (!P .)* P -proc findBounds*(s: string, pattern: Peg, matches: var openArray[string], +func findBounds*(s: string, pattern: Peg, matches: var openArray[string], start = 0): tuple[first, last: int] {. - noSideEffect, rtl, extern: "npegs$1Capture".} = + rtl, extern: "npegs$1Capture".} = ## returns the starting position and end position of ``pattern`` in ``s`` ## and the captured ## substrings in the array ``matches``. If it does not match, nothing @@ -1126,8 +1148,8 @@ proc findBounds*(s: string, pattern: Peg, matches: var openArray[string], return (i, i+L-1) return (-1, 0) -proc find*(s: string, pattern: Peg, - start = 0): int {.noSideEffect, rtl, extern: "npegs$1".} = +func find*(s: string, pattern: Peg, + start = 0): int {.rtl, extern: "npegs$1".} = ## returns the starting position of ``pattern`` in ``s``. If it does not ## match, -1 is returned. var c: Captures @@ -1150,22 +1172,18 @@ iterator findAll*(s: string, pattern: Peg, start = 0): string = yield substr(s, i, i+L-1) inc(i, L) -proc findAll*(s: string, pattern: Peg, start = 0): seq[string] {. - noSideEffect, rtl, extern: "npegs$1".} = +func findAll*(s: string, pattern: Peg, start = 0): seq[string] {. + rtl, extern: "npegs$1".} = ## returns all matching *substrings* of `s` that match `pattern`. - ## If it does not match, @[] is returned. + ## If it does not match, `@[]` is returned. result = @[] for it in findAll(s, pattern, start): result.add it -when not defined(nimhygiene): - {.pragma: inject.} - template `=~`*(s: string, pattern: Peg): bool = ## This calls ``match`` with an implicit declared ``matches`` array that ## can be used in the scope of the ``=~`` call: ## - ## .. code-block:: nim - ## + ## ```nim ## if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}": ## # matches a key=value pair: ## echo("Key: ", matches[0]) @@ -1177,50 +1195,51 @@ template `=~`*(s: string, pattern: Peg): bool = ## echo("comment: ", matches[0]) ## else: ## echo("syntax error") - ## + ## ``` bind MaxSubpatterns when not declaredInScope(matches): - var matches {.inject.}: array[0..MaxSubpatterns-1, string] + var matches {.inject.} = default(array[0..MaxSubpatterns-1, string]) match(s, pattern, matches) # ------------------------- more string handling ------------------------------ -proc contains*(s: string, pattern: Peg, start = 0): bool {. - noSideEffect, rtl, extern: "npegs$1".} = +func contains*(s: string, pattern: Peg, start = 0): bool {. + rtl, extern: "npegs$1".} = ## same as ``find(s, pattern, start) >= 0`` return find(s, pattern, start) >= 0 -proc contains*(s: string, pattern: Peg, matches: var openArray[string], - start = 0): bool {.noSideEffect, rtl, extern: "npegs$1Capture".} = +func contains*(s: string, pattern: Peg, matches: var openArray[string], + start = 0): bool {.rtl, extern: "npegs$1Capture".} = ## same as ``find(s, pattern, matches, start) >= 0`` return find(s, pattern, matches, start) >= 0 -proc startsWith*(s: string, prefix: Peg, start = 0): bool {. - noSideEffect, rtl, extern: "npegs$1".} = +func startsWith*(s: string, prefix: Peg, start = 0): bool {. + rtl, extern: "npegs$1".} = ## returns true if `s` starts with the pattern `prefix` result = matchLen(s, prefix, start) >= 0 -proc endsWith*(s: string, suffix: Peg, start = 0): bool {. - noSideEffect, rtl, extern: "npegs$1".} = +func endsWith*(s: string, suffix: Peg, start = 0): bool {. + rtl, extern: "npegs$1".} = ## returns true if `s` ends with the pattern `suffix` var c: Captures c.origStart = start for i in start .. s.len-1: if rawMatch(s, suffix, i, c) == s.len - i: return true -proc replacef*(s: string, sub: Peg, by: string): string {. - noSideEffect, rtl, extern: "npegs$1".} = +func replacef*(s: string, sub: Peg, by: string): string {. + rtl, extern: "npegs$1".} = ## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by` ## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples: ## - ## .. code-block:: nim + ## ```nim ## "var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2") + ## ``` ## ## Results in: ## - ## .. code-block:: nim - ## + ## ```nim ## "var1<-keykey; val2<-key2key2" + ## ``` result = "" var i = 0 var caps: array[0..MaxSubpatterns-1, string] @@ -1237,8 +1256,8 @@ proc replacef*(s: string, sub: Peg, by: string): string {. inc(i, x) add(result, substr(s, i)) -proc replace*(s: string, sub: Peg, by = ""): string {. - noSideEffect, rtl, extern: "npegs$1".} = +func replace*(s: string, sub: Peg, by = ""): string {. + rtl, extern: "npegs$1".} = ## Replaces `sub` in `s` by the string `by`. Captures cannot be accessed ## in `by`. result = "" @@ -1254,9 +1273,9 @@ proc replace*(s: string, sub: Peg, by = ""): string {. inc(i, x) add(result, substr(s, i)) -proc parallelReplace*(s: string, subs: varargs[ +func parallelReplace*(s: string, subs: varargs[ tuple[pattern: Peg, repl: string]]): string {. - noSideEffect, rtl, extern: "npegs$1".} = + rtl, extern: "npegs$1".} = ## Returns a modified copy of `s` with the substitutions in `subs` ## applied in parallel. result = "" @@ -1278,16 +1297,18 @@ proc parallelReplace*(s: string, subs: varargs[ # copy the rest: add(result, substr(s, i)) -proc replace*(s: string, sub: Peg, cb: proc( +when not defined(nimHasEffectsOf): + {.pragma: effectsOf.} + +func replace*(s: string, sub: Peg, cb: proc( match: int, cnt: int, caps: openArray[string]): string): string {. - rtl, extern: "npegs$1cb".} = + rtl, extern: "npegs$1cb", effectsOf: cb.} = ## Replaces `sub` in `s` by the resulting strings from the callback. ## The callback proc receives the index of the current match (starting with 0), ## the count of captures and an open array with the captures of each match. Examples: ## - ## .. code-block:: nim - ## - ## proc handleMatches*(m: int, n: int, c: openArray[string]): string = + ## ```nim + ## func handleMatches*(m: int, n: int, c: openArray[string]): string = ## result = "" ## if m > 0: ## result.add ", " @@ -1298,12 +1319,13 @@ proc replace*(s: string, sub: Peg, cb: proc( ## ## let s = "Var1=key1;var2=Key2; VAR3" ## echo s.replace(peg"{\ident}('='{\ident})* ';'* \s*", handleMatches) + ## ``` ## ## Results in: ## - ## .. code-block:: nim - ## + ## ```nim ## "var1: 'key1', var2: 'Key2', var3: ''" + ## ``` result = "" var i = 0 var caps: array[0..MaxSubpatterns-1, string] @@ -1331,7 +1353,7 @@ when not defined(js): ## error occurs. This is supposed to be used for quick scripting. ## ## **Note**: this proc does not exist while using the JS backend. - var x = readFile(infile).string + var x = readFile(infile) writeFile(outfile, x.parallelReplace(subs)) @@ -1341,18 +1363,19 @@ iterator split*(s: string, sep: Peg): string = ## Substrings are separated by the PEG `sep`. ## Examples: ## - ## .. code-block:: nim + ## ```nim ## for word in split("00232this02939is39an22example111", peg"\d+"): ## writeLine(stdout, word) + ## ``` ## ## Results in: ## - ## .. code-block:: nim + ## ```nim ## "this" ## "is" ## "an" ## "example" - ## + ## ``` var c: Captures var first = 0 @@ -1370,8 +1393,8 @@ iterator split*(s: string, sep: Peg): string = if first < last: yield substr(s, first, last-1) -proc split*(s: string, sep: Peg): seq[string] {. - noSideEffect, rtl, extern: "npegs$1".} = +func split*(s: string, sep: Peg): seq[string] {. + rtl, extern: "npegs$1".} = ## Splits the string `s` into substrings. result = @[] for it in split(s, sep): result.add it @@ -1397,6 +1420,7 @@ type tkCurlyLe, ## '{' tkCurlyRi, ## '}' tkCurlyAt, ## '{@}' + tkEmptyCurl, ## '{}' tkArrow, ## '<-' tkBar, ## '/' tkStar, ## '*' @@ -1429,25 +1453,25 @@ type const tokKindToStr: array[TokKind, string] = [ "invalid", "[EOF]", ".", "_", "identifier", "string literal", - "character set", "(", ")", "{", "}", "{@}", + "character set", "(", ")", "{", "}", "{@}", "{}", "<-", "/", "*", "+", "&", "!", "?", "@", "built-in", "escaped", "$", "$", "^" ] -proc handleCR(L: var PegLexer, pos: int): int = +func handleCR(L: var PegLexer, pos: int): int = assert(L.buf[pos] == '\c') inc(L.lineNumber) result = pos+1 if result < L.buf.len and L.buf[result] == '\L': inc(result) L.lineStart = result -proc handleLF(L: var PegLexer, pos: int): int = +func handleLF(L: var PegLexer, pos: int): int = assert(L.buf[pos] == '\L') inc(L.lineNumber) result = pos+1 L.lineStart = result -proc init(L: var PegLexer, input, filename: string, line = 1, col = 0) = +func init(L: var PegLexer, input, filename: string, line = 1, col = 0) = L.buf = input L.bufpos = 0 L.lineNumber = line @@ -1455,31 +1479,18 @@ proc init(L: var PegLexer, input, filename: string, line = 1, col = 0) = L.lineStart = 0 L.filename = filename -proc getColumn(L: PegLexer): int {.inline.} = +func getColumn(L: PegLexer): int {.inline.} = result = abs(L.bufpos - L.lineStart) + L.colOffset -proc getLine(L: PegLexer): int {.inline.} = +func getLine(L: PegLexer): int {.inline.} = result = L.lineNumber -proc errorStr(L: PegLexer, msg: string, line = -1, col = -1): string = +func errorStr(L: PegLexer, msg: string, line = -1, col = -1): string = var line = if line < 0: getLine(L) else: line var col = if col < 0: getColumn(L) else: col result = "$1($2, $3) Error: $4" % [L.filename, $line, $col, msg] -proc handleHexChar(c: var PegLexer, xi: var int) = - case c.buf[c.bufpos] - of '0'..'9': - xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('0')) - inc(c.bufpos) - of 'a'..'f': - xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('a') + 10) - inc(c.bufpos) - of 'A'..'F': - xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('A') + 10) - inc(c.bufpos) - else: discard - -proc getEscapedChar(c: var PegLexer, tok: var Token) = +func getEscapedChar(c: var PegLexer, tok: var Token) = inc(c.bufpos) if c.bufpos >= len(c.buf): tok.kind = tkInvalid @@ -1515,8 +1526,10 @@ proc getEscapedChar(c: var PegLexer, tok: var Token) = tok.kind = tkInvalid return var xi = 0 - handleHexChar(c, xi) - handleHexChar(c, xi) + if handleHexChar(c.buf[c.bufpos], xi): + inc(c.bufpos) + if handleHexChar(c.buf[c.bufpos], xi): + inc(c.bufpos) if xi == 0: tok.kind = tkInvalid else: add(tok.literal, chr(xi)) of '0'..'9': @@ -1537,7 +1550,7 @@ proc getEscapedChar(c: var PegLexer, tok: var Token) = add(tok.literal, c.buf[c.bufpos]) inc(c.bufpos) -proc skip(c: var PegLexer) = +func skip(c: var PegLexer) = var pos = c.bufpos while pos < c.buf.len: case c.buf[pos] @@ -1554,7 +1567,7 @@ proc skip(c: var PegLexer) = break # EndOfFile also leaves the loop c.bufpos = pos -proc getString(c: var PegLexer, tok: var Token) = +func getString(c: var PegLexer, tok: var Token) = tok.kind = tkStringLit var pos = c.bufpos + 1 var quote = c.buf[pos-1] @@ -1575,19 +1588,27 @@ proc getString(c: var PegLexer, tok: var Token) = inc(pos) c.bufpos = pos -proc getDollar(c: var PegLexer, tok: var Token) = +func getDollar(c: var PegLexer, tok: var Token) = var pos = c.bufpos + 1 + var neg = false + if pos < c.buf.len and c.buf[pos] == '^': + neg = true + inc(pos) if pos < c.buf.len and c.buf[pos] in {'0'..'9'}: tok.kind = tkBackref tok.index = 0 while pos < c.buf.len and c.buf[pos] in {'0'..'9'}: tok.index = tok.index * 10 + ord(c.buf[pos]) - ord('0') inc(pos) + if neg: + tok.index = -tok.index else: + if neg: + dec(pos) tok.kind = tkDollar c.bufpos = pos -proc getCharSet(c: var PegLexer, tok: var Token) = +func getCharSet(c: var PegLexer, tok: var Token) = tok.kind = tkCharSet tok.charset = {} var pos = c.bufpos + 1 @@ -1644,7 +1665,7 @@ proc getCharSet(c: var PegLexer, tok: var Token) = c.bufpos = pos if caret: tok.charset = {'\1'..'\xFF'} - tok.charset -proc getSymbol(c: var PegLexer, tok: var Token) = +func getSymbol(c: var PegLexer, tok: var Token) = var pos = c.bufpos while pos < c.buf.len: add(tok.literal, c.buf[pos]) @@ -1653,7 +1674,7 @@ proc getSymbol(c: var PegLexer, tok: var Token) = c.bufpos = pos tok.kind = tkIdentifier -proc getBuiltin(c: var PegLexer, tok: var Token) = +func getBuiltin(c: var PegLexer, tok: var Token) = if c.bufpos+1 < c.buf.len and c.buf[c.bufpos+1] in strutils.Letters: inc(c.bufpos) getSymbol(c, tok) @@ -1662,7 +1683,7 @@ proc getBuiltin(c: var PegLexer, tok: var Token) = tok.kind = tkEscaped getEscapedChar(c, tok) # may set tok.kind to tkInvalid -proc getTok(c: var PegLexer, tok: var Token) = +func getTok(c: var PegLexer, tok: var Token) = tok.kind = tkInvalid tok.modifier = modNone setLen(tok.literal, 0) @@ -1683,6 +1704,10 @@ proc getTok(c: var PegLexer, tok: var Token) = tok.kind = tkCurlyAt inc(c.bufpos, 2) add(tok.literal, "{@}") + elif c.buf[c.bufpos] == '}' and c.bufpos < c.buf.len: + tok.kind = tkEmptyCurl + inc(c.bufpos) + add(tok.literal, "{}") else: tok.kind = tkCurlyLe add(tok.literal, '{') @@ -1718,7 +1743,7 @@ proc getTok(c: var PegLexer, tok: var Token) = return if c.buf[c.bufpos] in {'\'', '"'} or c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and - c.buf[c.bufpos+1] in {'0'..'9'}: + c.buf[c.bufpos+1] in {'^', '0'..'9'}: case tok.literal of "i": tok.modifier = modIgnoreCase of "y": tok.modifier = modIgnoreStyle @@ -1780,7 +1805,7 @@ proc getTok(c: var PegLexer, tok: var Token) = add(tok.literal, c.buf[c.bufpos]) inc(c.bufpos) -proc arrowIsNextTok(c: PegLexer): bool = +func arrowIsNextTok(c: PegLexer): bool = # the only look ahead we need var pos = c.bufpos while pos < c.buf.len and c.buf[pos] in {'\t', ' '}: inc(pos) @@ -1801,23 +1826,21 @@ type identIsVerbatim: bool skip: Peg -proc pegError(p: PegParser, msg: string, line = -1, col = -1) = - var e: ref EInvalidPeg - new(e) - e.msg = errorStr(p, msg, line, col) +func pegError(p: PegParser, msg: string, line = -1, col = -1) = + var e = (ref EInvalidPeg)(msg: errorStr(p, msg, line, col)) raise e -proc getTok(p: var PegParser) = +func getTok(p: var PegParser) = getTok(p, p.tok) if p.tok.kind == tkInvalid: pegError(p, "'" & p.tok.literal & "' is invalid token") -proc eat(p: var PegParser, kind: TokKind) = +func eat(p: var PegParser, kind: TokKind) = if p.tok.kind == kind: getTok(p) else: pegError(p, tokKindToStr[kind] & " expected") -proc parseExpr(p: var PegParser): Peg {.gcsafe.} +func parseExpr(p: var PegParser): Peg {.gcsafe.} -proc getNonTerminal(p: var PegParser, name: string): NonTerminal = +func getNonTerminal(p: var PegParser, name: string): NonTerminal = for i in 0..high(p.nonterms): result = p.nonterms[i] if cmpIgnoreStyle(result.name, name) == 0: return @@ -1825,19 +1848,22 @@ proc getNonTerminal(p: var PegParser, name: string): NonTerminal = result = newNonTerminal(name, getLine(p), getColumn(p)) add(p.nonterms, result) -proc modifiedTerm(s: string, m: Modifier): Peg = +func modifiedTerm(s: string, m: Modifier): Peg = case m of modNone, modVerbatim: result = term(s) of modIgnoreCase: result = termIgnoreCase(s) of modIgnoreStyle: result = termIgnoreStyle(s) -proc modifiedBackref(s: int, m: Modifier): Peg = +func modifiedBackref(s: int, m: Modifier): Peg = + var + reverse = s < 0 + index = if reverse: -s else: s case m - of modNone, modVerbatim: result = backref(s) - of modIgnoreCase: result = backrefIgnoreCase(s) - of modIgnoreStyle: result = backrefIgnoreStyle(s) + of modNone, modVerbatim: result = backref(index, reverse) + of modIgnoreCase: result = backrefIgnoreCase(index, reverse) + of modIgnoreStyle: result = backrefIgnoreStyle(index, reverse) -proc builtin(p: var PegParser): Peg = +func builtin(p: var PegParser): Peg = # do not use "y", "skip" or "i" as these would be ambiguous case p.tok.literal of "n": result = newLine() @@ -1857,11 +1883,11 @@ proc builtin(p: var PegParser): Peg = of "white": result = unicodeWhitespace() else: pegError(p, "unknown built-in: " & p.tok.literal) -proc token(terminal: Peg, p: PegParser): Peg = +func token(terminal: Peg, p: PegParser): Peg = if p.skip.kind == pkEmpty: result = terminal else: result = sequence(p.skip, terminal) -proc primary(p: var PegParser): Peg = +func primary(p: var PegParser): Peg = case p.tok.kind of tkAmp: getTok(p) @@ -1885,7 +1911,8 @@ proc primary(p: var PegParser): Peg = getTok(p) elif not arrowIsNextTok(p): var nt = getNonTerminal(p, p.tok.literal) - incl(nt.flags, ntUsed) + {.cast(noSideEffect).}: + incl(nt.flags, ntUsed) result = nonterminal(nt).token(p) getTok(p) else: @@ -1909,6 +1936,9 @@ proc primary(p: var PegParser): Peg = result = capture(parseExpr(p)).token(p) eat(p, tkCurlyRi) inc(p.captures) + of tkEmptyCurl: + result = capture() + getTok(p) of tkAny: result = any().token(p) getTok(p) @@ -1928,11 +1958,11 @@ proc primary(p: var PegParser): Peg = result = startAnchor() getTok(p) of tkBackref: + if abs(p.tok.index) > p.captures or p.tok.index == 0: + pegError(p, "invalid back reference index: " & $p.tok.index) var m = p.tok.modifier if m == modNone: m = p.modifier result = modifiedBackref(p.tok.index, m).token(p) - if p.tok.index < 0 or p.tok.index > p.captures: - pegError(p, "invalid back reference index: " & $p.tok.index) getTok(p) else: pegError(p, "expression expected, but found: " & p.tok.literal) @@ -1950,13 +1980,13 @@ proc primary(p: var PegParser): Peg = getTok(p) else: break -proc seqExpr(p: var PegParser): Peg = +func seqExpr(p: var PegParser): Peg = result = primary(p) while true: case p.tok.kind of tkAmp, tkNot, tkAt, tkStringLit, tkCharSet, tkParLe, tkCurlyLe, tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref, - tkHat, tkCurlyAt: + tkHat, tkCurlyAt, tkEmptyCurl: result = sequence(result, primary(p)) of tkIdentifier: if not arrowIsNextTok(p): @@ -1964,27 +1994,29 @@ proc seqExpr(p: var PegParser): Peg = else: break else: break -proc parseExpr(p: var PegParser): Peg = +func parseExpr(p: var PegParser): Peg = result = seqExpr(p) while p.tok.kind == tkBar: getTok(p) result = result / seqExpr(p) -proc parseRule(p: var PegParser): NonTerminal = +func parseRule(p: var PegParser): NonTerminal = if p.tok.kind == tkIdentifier and arrowIsNextTok(p): result = getNonTerminal(p, p.tok.literal) if ntDeclared in result.flags: pegError(p, "attempt to redefine: " & result.name) - result.line = getLine(p) - result.col = getColumn(p) + {.cast(noSideEffect).}: + result.line = getLine(p) + result.col = getColumn(p) getTok(p) eat(p, tkArrow) - result.rule = parseExpr(p) - incl(result.flags, ntDeclared) # NOW inlining may be attempted + {.cast(noSideEffect).}: + result.rule = parseExpr(p) + incl(result.flags, ntDeclared) # NOW inlining may be attempted else: pegError(p, "rule expected, but found: " & p.tok.literal) -proc rawParse(p: var PegParser): Peg = +func rawParse(p: var PegParser): Peg = ## parses a rule or a PEG expression while p.tok.kind == tkBuiltin: case p.tok.literal @@ -2014,7 +2046,7 @@ proc rawParse(p: var PegParser): Peg = elif ntUsed notin nt.flags and i > 0: pegError(p, "unused rule: " & nt.name, nt.line, nt.col) -proc parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): Peg = +func parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): Peg = ## constructs a Peg object from `pattern`. `filename`, `line`, `col` are ## used for error messages, but they only provide start offsets. `parsePeg` ## keeps track of line and column numbers within `pattern`. @@ -2029,14 +2061,14 @@ proc parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): Peg = getTok(p) result = rawParse(p) -proc peg*(pattern: string): Peg = +func peg*(pattern: string): Peg = ## constructs a Peg object from the `pattern`. The short name has been - ## chosen to encourage its use as a raw string modifier:: + ## chosen to encourage its use as a raw string modifier: ## - ## peg"{\ident} \s* '=' \s* {.*}" + ## peg"{\ident} \s* '=' \s* {.*}" result = parsePeg(pattern, "pattern") -proc escapePeg*(s: string): string = +func escapePeg*(s: string): string = ## escapes `s` so that it is matched verbatim when used as a peg. result = "" var inQuote = false @@ -2054,147 +2086,3 @@ proc escapePeg*(s: string): string = inQuote = true result.add(c) if inQuote: result.add('\'') - -when isMainModule: - proc pegsTest() = - assert escapePeg("abc''def'") == r"'abc'\x27\x27'def'\x27" - assert match("(a b c)", peg"'(' @ ')'") - assert match("W_HI_Le", peg"\y 'while'") - assert(not match("W_HI_L", peg"\y 'while'")) - assert(not match("W_HI_Le", peg"\y v'while'")) - assert match("W_HI_Le", peg"y'while'") - - assert($ +digits == $peg"\d+") - assert "0158787".match(peg"\d+") - assert "ABC 0232".match(peg"\w+\s+\d+") - assert "ABC".match(peg"\d+ / \w+") - - var accum: seq[string] = @[] - for word in split("00232this02939is39an22example111", peg"\d+"): - accum.add(word) - assert(accum == @["this", "is", "an", "example"]) - - assert matchLen("key", ident) == 3 - - var pattern = sequence(ident, *whitespace, term('='), *whitespace, ident) - assert matchLen("key1= cal9", pattern) == 11 - - var ws = newNonTerminal("ws", 1, 1) - ws.rule = *whitespace - - var expr = newNonTerminal("expr", 1, 1) - expr.rule = sequence(capture(ident), *sequence( - nonterminal(ws), term('+'), nonterminal(ws), nonterminal(expr))) - - var c: Captures - var s = "a+b + c +d+e+f" - assert rawMatch(s, expr.rule, 0, c) == len(s) - var a = "" - for i in 0..c.ml-1: - a.add(substr(s, c.matches[i][0], c.matches[i][1])) - assert a == "abcdef" - #echo expr.rule - - #const filename = "lib/devel/peg/grammar.txt" - #var grammar = parsePeg(newFileStream(filename, fmRead), filename) - #echo "a <- [abc]*?".match(grammar) - assert find("_____abc_______", term("abc"), 2) == 5 - assert match("_______ana", peg"A <- 'ana' / . A") - assert match("abcs%%%", peg"A <- ..A / .A / '%'") - - var matches: array[0..MaxSubpatterns-1, string] - if "abc" =~ peg"{'a'}'bc' 'xyz' / {\ident}": - assert matches[0] == "abc" - else: - assert false - - var g2 = peg"""S <- A B / C D - A <- 'a'+ - B <- 'b'+ - C <- 'c'+ - D <- 'd'+ - """ - assert($g2 == "((A B) / (C D))") - assert match("cccccdddddd", g2) - assert("var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2") == - "var1<-keykey; var2<-key2key2") - assert("var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2") == - "$1<-$2$2; $1<-$2$2") - assert "var1=key; var2=key2".endsWith(peg"{\ident}'='{\ident}") - - if "aaaaaa" =~ peg"'aa' !. / ({'a'})+": - assert matches[0] == "a" - else: - assert false - - if match("abcdefg", peg"c {d} ef {g}", matches, 2): - assert matches[0] == "d" - assert matches[1] == "g" - else: - assert false - - accum = @[] - for x in findAll("abcdef", peg".", 3): - accum.add(x) - assert(accum == @["d", "e", "f"]) - - for x in findAll("abcdef", peg"^{.}", 3): - assert x == "d" - - if "f(a, b)" =~ peg"{[0-9]+} / ({\ident} '(' {@} ')')": - assert matches[0] == "f" - assert matches[1] == "a, b" - else: - assert false - - assert match("eine übersicht und außerdem", peg"(\letter \white*)+") - # ß is not a lower cased letter?! - assert match("eine übersicht und auerdem", peg"(\lower \white*)+") - assert match("EINE ÜBERSICHT UND AUSSERDEM", peg"(\upper \white*)+") - assert(not match("456678", peg"(\letter)+")) - - assert("var1 = key; var2 = key2".replacef( - peg"\skip(\s*) {\ident}'='{\ident}", "$1<-$2$2") == - "var1<-keykey;var2<-key2key2") - - assert match("prefix/start", peg"^start$", 7) - - if "foo" =~ peg"{'a'}?.*": - assert matches[0].len == 0 - else: assert false - - if "foo" =~ peg"{''}.*": - assert matches[0] == "" - else: assert false - - if "foo" =~ peg"{'foo'}": - assert matches[0] == "foo" - else: assert false - - let empty_test = peg"^\d*" - let str = "XYZ" - - assert(str.find(empty_test) == 0) - assert(str.match(empty_test)) - - proc handleMatches(m: int, n: int, c: openArray[string]): string = - result = "" - - if m > 0: - result.add ", " - - result.add case n: - of 2: toLowerAscii(c[0]) & ": '" & c[1] & "'" - of 1: toLowerAscii(c[0]) & ": ''" - else: "" - - assert("Var1=key1;var2=Key2; VAR3". - replace(peg"{\ident}('='{\ident})* ';'* \s*", - handleMatches) == "var1: 'key1', var2: 'Key2', var3: ''") - - - doAssert "test1".match(peg"""{@}$""") - doAssert "test2".match(peg"""{(!$ .)*} $""") - pegsTest() - static: - pegsTest() |