diff options
Diffstat (limited to 'lib/pure/pegs.nim')
-rw-r--r-- | lib/pure/pegs.nim | 821 |
1 files changed, 583 insertions, 238 deletions
diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim index 5ae2d9182..3ee82917d 100644 --- a/lib/pure/pegs.nim +++ b/lib/pure/pegs.nim @@ -20,11 +20,11 @@ include "system/inclrtl" const useUnicode = true ## change this to deactivate proper UTF-8 support -import - strutils +import strutils, macros when useUnicode: import unicode + export unicode.`==` const InlineThreshold = 5 ## number of leaves; -1 to disable inlining @@ -32,7 +32,7 @@ const ## can be captured. More subpatterns cannot be captured! type - PegKind = enum + PegKind* = enum pkEmpty, pkAny, ## any character (.) pkAnyRune, ## any Unicode character (_) @@ -67,15 +67,15 @@ type pkRule, ## a <- b pkList, ## a, b pkStartAnchor ## ^ --> Internal DSL: startAnchor() - NonTerminalFlag = enum + NonTerminalFlag* = enum ntDeclared, ntUsed NonTerminalObj = object ## represents a non terminal symbol name: string ## the name of the symbol line: int ## line the symbol has been declared/used in col: int ## column the symbol has been declared/used in flags: set[NonTerminalFlag] ## the nonterminal's flags - rule: Node ## the rule that the symbol refers to - Node {.shallow.} = object + rule: Peg ## the rule that the symbol refers to + Peg* {.shallow.} = object ## type that represents a PEG case kind: PegKind of pkEmpty..pkWhitespace: nil of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle: term: string @@ -83,12 +83,61 @@ type of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char] of pkNonTerminal: nt: NonTerminal of pkBackRef..pkBackRefIgnoreStyle: index: range[0..MaxSubpatterns] - else: sons: seq[Node] + else: sons: seq[Peg] NonTerminal* = ref NonTerminalObj - Peg* = Node ## type that represents a PEG +proc kind*(p: Peg): PegKind = p.kind + ## Returns the *PegKind* of a given *Peg* object. -{.deprecated: [TPeg: Peg, TNode: Node].} +proc term*(p: Peg): string = p.term + ## Returns the *string* representation of a given *Peg* variant object + ## where present. + +proc ch*(p: Peg): char = p.ch + ## Returns the *char* representation of a given *Peg* variant object + ## where present. + +proc charChoice*(p: Peg): ref set[char] = p.charChoice + ## Returns the *charChoice* field of a given *Peg* variant object + ## where present. + +proc nt*(p: Peg): NonTerminal = p.nt + ## Returns the *NonTerminal* object of a given *Peg* variant object + ## where present. + +proc index*(p: Peg): range[0..MaxSubpatterns] = p.index + ## Returns the back-reference index of a captured sub-pattern in the + ## *Captures* object for a given *Peg* variant object where present. + +iterator items*(p: Peg): Peg {.inline.} = + ## Yields the child nodes of a *Peg* variant object where present. + for s in p.sons: + yield s + +iterator pairs*(p: Peg): (int, Peg) {.inline.} = + ## Yields the indices and child nodes of a *Peg* variant object where present. + for i in 0 ..< p.sons.len: + yield (i, p.sons[i]) + +proc name*(nt: NonTerminal): string = nt.name + ## Gets the name of the symbol represented by the parent *Peg* object variant + ## of a given *NonTerminal*. + +proc line*(nt: NonTerminal): int = nt.line + ## Gets the line number of the definition of the parent *Peg* object variant + ## of a given *NonTerminal*. + +proc col*(nt: NonTerminal): int = nt.col + ## Gets the column number of the definition of the parent *Peg* object variant + ## of a given *NonTerminal*. + +proc flags*(nt: NonTerminal): set[NonTerminalFlag] = nt.flags + ## Gets the *NonTerminalFlag*-typed flags field of the parent *Peg* variant + ## object of a given *NonTerminal*. + +proc rule*(nt: NonTerminal): Peg = nt.rule + ## Gets the *Peg* object representing the rule definition of the parent *Peg* + ## object variant of a given *NonTerminal*. proc term*(t: string): Peg {.nosideEffect, rtl, extern: "npegs$1Str".} = ## constructs a PEG from a terminal string @@ -525,215 +574,497 @@ when not useUnicode: proc isTitle(a: char): bool {.inline.} = return false proc isWhiteSpace(a: char): bool {.inline.} = return a in {' ', '\9'..'\13'} -proc rawMatch*(s: string, p: Peg, start: int, c: var Captures): int {. - nosideEffect, rtl, extern: "npegs$1".} = - ## low-level matching proc that implements the PEG interpreter. Use this - ## for maximum efficiency (every other PEG operation ends up calling this - ## proc). - ## Returns -1 if it does not match, else the length of the match - case p.kind - of pkEmpty: result = 0 # match of length 0 - of pkAny: - if s[start] != '\0': result = 1 - else: result = -1 - of pkAnyRune: - if s[start] != '\0': - result = runeLenAt(s, start) - else: - result = -1 - of pkLetter: - if s[start] != '\0': - var a: Rune - result = start - fastRuneAt(s, result, a) - if isAlpha(a): dec(result, start) +template matchOrParse(mopProc: untyped): typed = + # Used to make the main matcher proc *rawMatch* as well as event parser + # procs. For the former, *enter* and *leave* event handler code generators + # are provided which just return *discard*. + + proc mopProc(s: string, p: Peg, start: int, c: var Captures): int = + proc matchBackRef(s: string, p: Peg, start: int, c: var Captures): int = + # Parse handler code must run in an *of* clause of its own for each + # *PegKind*, so we encapsulate the identical clause body for + # *pkBackRef..pkBackRefIgnoreStyle* here. + if p.index >= c.ml: return -1 + var (a, b) = c.matches[p.index] + var n: Peg + n.kind = succ(pkTerminal, ord(p.kind)-ord(pkBackRef)) + n.term = s.substr(a, b) + mopProc(s, n, start, c) + + case p.kind + of pkEmpty: + enter(pkEmpty, s, p, start) + result = 0 # match of length 0 + leave(pkEmpty, s, p, start, result) + of pkAny: + enter(pkAny, s, p, start) + if start < s.len: result = 1 else: result = -1 - else: - result = -1 - of pkLower: - if s[start] != '\0': - var a: Rune - result = start - fastRuneAt(s, result, a) - if isLower(a): dec(result, start) + leave(pkAny, s, p, start, result) + of pkAnyRune: + enter(pkAnyRune, s, p, start) + if start < s.len: + result = runeLenAt(s, start) + else: + result = -1 + leave(pkAnyRune, s, p, start, result) + of pkLetter: + enter(pkLetter, s, p, start) + if start < s.len: + var a: Rune + result = start + fastRuneAt(s, result, a) + if isAlpha(a): dec(result, start) + else: result = -1 + else: + result = -1 + leave(pkLetter, s, p, start, result) + of pkLower: + enter(pkLower, s, p, start) + if start < s.len: + var a: Rune + result = start + fastRuneAt(s, result, a) + if isLower(a): dec(result, start) + else: result = -1 + else: + result = -1 + leave(pkLower, s, p, start, result) + of pkUpper: + enter(pkUpper, s, p, start) + if start < s.len: + var a: Rune + result = start + fastRuneAt(s, result, a) + if isUpper(a): dec(result, start) + else: result = -1 + else: + result = -1 + leave(pkUpper, s, p, start, result) + of pkTitle: + enter(pkTitle, s, p, start) + if start < s.len: + var a: Rune + result = start + fastRuneAt(s, result, a) + if isTitle(a): dec(result, start) + else: result = -1 + else: + result = -1 + leave(pkTitle, s, p, start, result) + of pkWhitespace: + enter(pkWhitespace, s, p, start) + if start < s.len: + var a: Rune + result = start + fastRuneAt(s, result, a) + if isWhiteSpace(a): dec(result, start) + else: result = -1 + else: + result = -1 + leave(pkWhitespace, s, p, start, result) + of pkGreedyAny: + enter(pkGreedyAny, s, p, start) + result = len(s) - start + leave(pkGreedyAny, s, p, start, result) + of pkNewLine: + enter(pkNewLine, s, p, start) + if start < s.len and s[start] == '\L': result = 1 + elif start < s.len and s[start] == '\C': + if start+1 < s.len and s[start+1] == '\L': result = 2 + else: result = 1 else: result = -1 - else: - result = -1 - of pkUpper: - if s[start] != '\0': - var a: Rune + leave(pkNewLine, s, p, start, result) + of pkTerminal: + enter(pkTerminal, s, p, start) + result = len(p.term) + for i in 0..result-1: + if start+i >= s.len or p.term[i] != s[start+i]: + result = -1 + break + leave(pkTerminal, s, p, start, result) + of pkTerminalIgnoreCase: + enter(pkTerminalIgnoreCase, s, p, start) + var + i = 0 + a, b: Rune result = start - fastRuneAt(s, result, a) - if isUpper(a): dec(result, start) - else: result = -1 - else: - result = -1 - of pkTitle: - if s[start] != '\0': - var a: Rune + while i < len(p.term): + if result >= s.len: + result = -1 + break + fastRuneAt(p.term, i, a) + fastRuneAt(s, result, b) + if toLower(a) != toLower(b): + result = -1 + break + dec(result, start) + leave(pkTerminalIgnoreCase, s, p, start, result) + of pkTerminalIgnoreStyle: + enter(pkTerminalIgnoreStyle, s, p, start) + var + i = 0 + a, b: Rune result = start - fastRuneAt(s, result, a) - if isTitle(a): dec(result, start) + while i < len(p.term): + while i < len(p.term): + fastRuneAt(p.term, i, a) + if a != Rune('_'): break + while result < s.len: + fastRuneAt(s, result, b) + if b != Rune('_'): break + if result >= s.len: + if i >= p.term.len: break + else: + result = -1 + break + elif toLower(a) != toLower(b): + result = -1 + break + dec(result, start) + leave(pkTerminalIgnoreStyle, s, p, start, result) + of pkChar: + enter(pkChar, s, p, start) + if start < s.len and p.ch == s[start]: result = 1 else: result = -1 - else: - result = -1 - of pkWhitespace: - if s[start] != '\0': - var a: Rune - result = start - fastRuneAt(s, result, a) - if isWhiteSpace(a): dec(result, start) + leave(pkChar, s, p, start, result) + of pkCharChoice: + enter(pkCharChoice, s, p, start) + if start < s.len and contains(p.charChoice[], s[start]): result = 1 else: result = -1 - else: + leave(pkCharChoice, s, p, start, result) + of pkNonTerminal: + enter(pkNonTerminal, s, p, start) + var oldMl = c.ml + when false: echo "enter: ", p.nt.name + result = mopProc(s, p.nt.rule, start, c) + when false: echo "leave: ", p.nt.name + if result < 0: c.ml = oldMl + leave(pkNonTerminal, s, p, start, result) + of pkSequence: + enter(pkSequence, s, p, start) + var oldMl = c.ml + result = 0 + for i in 0..high(p.sons): + var x = mopProc(s, p.sons[i], start+result, c) + if x < 0: + c.ml = oldMl + result = -1 + break + else: inc(result, x) + leave(pkSequence, s, p, start, result) + of pkOrderedChoice: + enter(pkOrderedChoice, s, p, start) + var oldMl = c.ml + for i in 0..high(p.sons): + result = mopProc(s, p.sons[i], start, c) + if result >= 0: break + c.ml = oldMl + leave(pkOrderedChoice, s, p, start, result) + of pkSearch: + enter(pkSearch, s, p, start) + var oldMl = c.ml + result = 0 + while start+result <= s.len: + var x = mopProc(s, p.sons[0], start+result, c) + if x >= 0: + inc(result, x) + leave(pkSearch, s, p, start, result) + return + inc(result) result = -1 - of pkGreedyAny: - result = len(s) - start - of pkNewLine: - if s[start] == '\L': result = 1 - elif s[start] == '\C': - if s[start+1] == '\L': result = 2 - else: result = 1 - else: result = -1 - of pkTerminal: - result = len(p.term) - for i in 0..result-1: - if p.term[i] != s[start+i]: - result = -1 - break - of pkTerminalIgnoreCase: - var - i = 0 - a, b: Rune - result = start - while i < len(p.term): - fastRuneAt(p.term, i, a) - fastRuneAt(s, result, b) - if toLower(a) != toLower(b): - result = -1 - break - dec(result, start) - of pkTerminalIgnoreStyle: - var - i = 0 - a, b: Rune - result = start - while i < len(p.term): - while true: - fastRuneAt(p.term, i, a) - if a != Rune('_'): break + c.ml = oldMl + leave(pkSearch, s, p, start, result) + of pkCapturedSearch: + enter(pkCapturedSearch, s, p, start) + var idx = c.ml # reserve a slot for the subpattern + inc(c.ml) + result = 0 + while start+result <= s.len: + var x = mopProc(s, p.sons[0], start+result, c) + if x >= 0: + if idx < MaxSubpatterns: + c.matches[idx] = (start, start+result-1) + #else: silently ignore the capture + inc(result, x) + leave(pkCapturedSearch, s, p, start, result) + return + inc(result) + result = -1 + c.ml = idx + leave(pkCapturedSearch, s, p, start, result) + of pkGreedyRep: + enter(pkGreedyRep, s, p, start) + result = 0 while true: - fastRuneAt(s, result, b) - if b != Rune('_'): break - if toLower(a) != toLower(b): - result = -1 - break - dec(result, start) - of pkChar: - if p.ch == s[start]: result = 1 - else: result = -1 - of pkCharChoice: - if contains(p.charChoice[], s[start]): result = 1 - else: result = -1 - of pkNonTerminal: - var oldMl = c.ml - when false: echo "enter: ", p.nt.name - result = rawMatch(s, p.nt.rule, start, c) - when false: echo "leave: ", p.nt.name - if result < 0: c.ml = oldMl - of pkSequence: - var oldMl = c.ml - result = 0 - for i in 0..high(p.sons): - var x = rawMatch(s, p.sons[i], start+result, c) - if x < 0: + var x = mopProc(s, p.sons[0], start+result, c) + # if x == 0, we have an endless loop; so the correct behaviour would be + # not to break. But endless loops can be easily introduced: + # ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the + # expected thing in this case. + if x <= 0: break + inc(result, x) + leave(pkGreedyRep, s, p, start, result) + of pkGreedyRepChar: + enter(pkGreedyRepChar, s, p, start) + result = 0 + var ch = p.ch + while start+result < s.len and ch == s[start+result]: inc(result) + leave(pkGreedyRepChar, s, p, start, result) + of pkGreedyRepSet: + enter(pkGreedyRepSet, s, p, start) + result = 0 + while start+result < s.len and contains(p.charChoice[], s[start+result]): inc(result) + leave(pkGreedyRepSet, s, p, start, result) + of pkOption: + enter(pkOption, s, p, start) + result = max(0, mopProc(s, p.sons[0], start, c)) + leave(pkOption, s, p, start, result) + of pkAndPredicate: + enter(pkAndPredicate, s, p, start) + var oldMl = c.ml + result = mopProc(s, p.sons[0], start, c) + if result >= 0: result = 0 # do not consume anything + else: c.ml = oldMl + leave(pkAndPredicate, s, p, start, result) + of pkNotPredicate: + enter(pkNotPredicate, s, p, start) + var oldMl = c.ml + result = mopProc(s, p.sons[0], start, c) + if result < 0: result = 0 + else: c.ml = oldMl result = -1 - break - else: inc(result, x) - of pkOrderedChoice: - var oldMl = c.ml - for i in 0..high(p.sons): - result = rawMatch(s, p.sons[i], start, c) - if result >= 0: break - c.ml = oldMl - of pkSearch: - var oldMl = c.ml - result = 0 - while start+result <= s.len: - var x = rawMatch(s, p.sons[0], start+result, c) - if x >= 0: - inc(result, x) - return - inc(result) - result = -1 - c.ml = oldMl - of pkCapturedSearch: - var idx = c.ml # reserve a slot for the subpattern - inc(c.ml) - result = 0 - while start+result <= s.len: - var x = rawMatch(s, p.sons[0], start+result, c) - if x >= 0: + leave(pkNotPredicate, s, p, start, result) + of pkCapture: + enter(pkCapture, s, p, start) + var idx = c.ml # reserve a slot for the subpattern + inc(c.ml) + result = mopProc(s, p.sons[0], start, c) + if result >= 0: if idx < MaxSubpatterns: c.matches[idx] = (start, start+result-1) #else: silently ignore the capture - inc(result, x) - return - inc(result) - result = -1 - c.ml = idx - of pkGreedyRep: - result = 0 - while true: - var x = rawMatch(s, p.sons[0], start+result, c) - # if x == 0, we have an endless loop; so the correct behaviour would be - # not to break. But endless loops can be easily introduced: - # ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the - # expected thing in this case. - if x <= 0: break - inc(result, x) - of pkGreedyRepChar: - result = 0 - var ch = p.ch - while ch == s[start+result]: inc(result) - of pkGreedyRepSet: - result = 0 - while contains(p.charChoice[], s[start+result]): inc(result) - of pkOption: - result = max(0, rawMatch(s, p.sons[0], start, c)) - of pkAndPredicate: - var oldMl = c.ml - result = rawMatch(s, p.sons[0], start, c) - if result >= 0: result = 0 # do not consume anything - else: c.ml = oldMl - of pkNotPredicate: - var oldMl = c.ml - result = rawMatch(s, p.sons[0], start, c) - if result < 0: result = 0 - else: - c.ml = oldMl - result = -1 - of pkCapture: - var idx = c.ml # reserve a slot for the subpattern - inc(c.ml) - result = rawMatch(s, p.sons[0], start, c) - if result >= 0: - if idx < MaxSubpatterns: - c.matches[idx] = (start, start+result-1) - #else: silently ignore the capture - else: - c.ml = idx - of pkBackRef..pkBackRefIgnoreStyle: - if p.index >= c.ml: return -1 - var (a, b) = c.matches[p.index] - var n: Peg - n.kind = succ(pkTerminal, ord(p.kind)-ord(pkBackRef)) - n.term = s.substr(a, b) - result = rawMatch(s, n, start, c) - of pkStartAnchor: - if c.origStart == start: result = 0 - else: result = -1 - of pkRule, pkList: assert false + else: + c.ml = idx + leave(pkCapture, s, p, start, result) + of pkBackRef: + enter(pkBackRef, s, p, start) + result = matchBackRef(s, p, start, c) + leave(pkBackRef, s, p, start, result) + of pkBackRefIgnoreCase: + enter(pkBackRefIgnoreCase, s, p, start) + result = matchBackRef(s, p, start, c) + leave(pkBackRefIgnoreCase, s, p, start, result) + of pkBackRefIgnoreStyle: + enter(pkBackRefIgnoreStyle, s, p, start) + result = matchBackRef(s, p, start, c) + leave(pkBackRefIgnoreStyle, s, p, start, result) + of pkStartAnchor: + enter(pkStartAnchor, s, p, start) + if c.origStart == start: result = 0 + else: result = -1 + leave(pkStartAnchor, s, p, start, result) + of pkRule, pkList: assert false + +proc rawMatch*(s: string, p: Peg, start: int, c: var Captures): int + {.noSideEffect, rtl, extern: "npegs$1".} = + ## low-level matching proc that implements the PEG interpreter. Use this + ## for maximum efficiency (every other PEG operation ends up calling this + ## proc). + ## Returns -1 if it does not match, else the length of the match + + # Set the handler generators to produce do-nothing handlers. + template enter(pk, s, p, start) = + discard + template leave(pk, s, p, start, length) = + discard + matchOrParse(matchIt) + result = matchIt(s, p, start, c) + +macro mkHandlerTplts(handlers: untyped): untyped = + # Transforms the handler spec in *handlers* into handler templates. + # The AST structure of *handlers[0]*: + # + # .. code-block:: + # StmtList + # Call + # Ident "pkNonTerminal" + # StmtList + # Call + # Ident "enter" + # StmtList + # <handler code block> + # Call + # Ident "leave" + # StmtList + # <handler code block> + # Call + # Ident "pkChar" + # StmtList + # Call + # Ident "leave" + # StmtList + # <handler code block> + # ... + proc mkEnter(hdName, body: NimNode): NimNode = + quote do: + template `hdName`(s, p, start) = + let s {.inject.} = s + let p {.inject.} = p + let start {.inject.} = start + `body` + + template mkLeave(hdPostf, body) {.dirty.} = + # this has to be dirty to be able to capture *result* as *length* in + # *leaveXX* calls. + template `leave hdPostf`(s, p, start, length) = + body + + result = newStmtList() + for topCall in handlers[0]: + if nnkCall != topCall.kind: + error("Call syntax expected.", topCall) + let pegKind = topCall[0] + if nnkIdent != pegKind.kind: + error("PegKind expected.", pegKind) + if 2 == topCall.len: + for hdDef in topCall[1]: + if nnkCall != hdDef.kind: + error("Call syntax expected.", hdDef) + if nnkIdent != hdDef[0].kind: + error("Handler identifier expected.", hdDef[0]) + if 2 == hdDef.len: + let hdPostf = substr(pegKind.strVal, 2) + case hdDef[0].strVal + of "enter": + result.add mkEnter(newIdentNode("enter" & hdPostf), hdDef[1]) + of "leave": + result.add getAst(mkLeave(ident(hdPostf), hdDef[1])) + else: + error( + "Unsupported handler identifier, expected 'enter' or 'leave'.", + hdDef[0] + ) + +template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) = + ## Generates an interpreting event parser *proc* according to the specified + ## PEG AST and handler code blocks. The *proc* can be called with a string + ## to be parsed and will execute the handler code blocks whenever their + ## associated grammar element is matched. It returns -1 if the string does not + ## match, else the length of the total match. The following example code + ## evaluates an arithmetic expression defined by a simple PEG: + ## + ## .. code-block:: nim + ## import strutils, pegs + ## + ## let + ## pegAst = """ + ## Expr <- Sum + ## Sum <- Product (('+' / '-')Product)* + ## Product <- Value (('*' / '/')Value)* + ## Value <- [0-9]+ / '(' Expr ')' + ## """.peg + ## txt = "(5+3)/2-7*22" + ## + ## var + ## pStack: seq[string] = @[] + ## valStack: seq[float] = @[] + ## opStack = "" + ## let + ## parseArithExpr = pegAst.eventParser: + ## pkNonTerminal: + ## enter: + ## pStack.add p.nt.name + ## leave: + ## pStack.setLen pStack.high + ## if length > 0: + ## let matchStr = s.substr(start, start+length-1) + ## case p.nt.name + ## of "Value": + ## try: + ## valStack.add matchStr.parseFloat + ## echo valStack + ## except ValueError: + ## discard + ## of "Sum", "Product": + ## try: + ## let val = matchStr.parseFloat + ## except ValueError: + ## if valStack.len > 1 and opStack.len > 0: + ## valStack[^2] = case opStack[^1] + ## of '+': valStack[^2] + valStack[^1] + ## of '-': valStack[^2] - valStack[^1] + ## of '*': valStack[^2] * valStack[^1] + ## else: valStack[^2] / valStack[^1] + ## valStack.setLen valStack.high + ## echo valStack + ## opStack.setLen opStack.high + ## echo opStack + ## pkChar: + ## leave: + ## if length == 1 and "Value" != pStack[^1]: + ## let matchChar = s[start] + ## opStack.add matchChar + ## echo opStack + ## + ## let pLen = parseArithExpr(txt) + ## + ## The *handlers* parameter consists of code blocks for *PegKinds*, + ## which define the grammar elements of interest. Each block can contain + ## handler code to be executed when the parser enters and leaves text + ## matching the grammar element. An *enter* handler can access the specific + ## PEG AST node being matched as *p*, the entire parsed string as *s* + ## and the position of the matched text segment in *s* as *start*. A *leave* + ## handler can access *p*, *s*, *start* and also the length of the matched + ## text segment as *length*. For an unsuccessful match, the *enter* and + ## *leave* handlers will be executed, with *length* set to -1. + ## + ## Symbols declared in an *enter* handler can be made visible in the + ## corresponding *leave* handler by annotating them with an *inject* pragma. + proc rawParse(s: string, p: Peg, start: int, c: var Captures): int + {.genSym.} = + + # binding from *macros* + bind strVal + + mkHandlerTplts: + handlers + + macro enter(pegKind, s, pegNode, start: untyped): untyped = + # This is called by the matcher code in *matchOrParse* at the + # start of the code for a grammar element of kind *pegKind*. + # Expands to a call to the handler template if one was generated + # by *mkHandlerTplts*. + template mkDoEnter(hdPostf, s, pegNode, start) = + when declared(`enter hdPostf`): + `enter hdPostf`(s, pegNode, start): + else: + discard + let hdPostf = ident(substr(strVal(pegKind), 2)) + getAst(mkDoEnter(hdPostf, s, pegNode, start)) + + macro leave(pegKind, s, pegNode, start, length: untyped): untyped = + # Like *enter*, but called at the end of the matcher code for + # a grammar element of kind *pegKind*. + template mkDoLeave(hdPostf, s, pegNode, start, length) = + when declared(`leave hdPostf`): + `leave hdPostf`(s, pegNode, start, length): + else: + discard + let hdPostf = ident(substr(strVal(pegKind), 2)) + getAst(mkDoLeave(hdPostf, s, pegNode, start, length)) + + matchOrParse(parseIt) + parseIt(s, p, start, c) + + proc parser(s: string): int {.genSym.} = + # the proc to be returned + var + ms: array[MaxSubpatterns, (int, int)] + cs = Captures(matches: ms, ml: 0, origStart: 0) + rawParse(s, pegAst, 0, cs) + parser template fillMatches(s, caps, c) = for k in 0..c.ml-1: @@ -742,7 +1073,7 @@ template fillMatches(s, caps, c) = if startIdx != -1: caps[k] = substr(s, startIdx, endIdx) else: - caps[k] = nil + caps[k] = "" proc matchLen*(s: string, pattern: Peg, matches: var openArray[string], start = 0): int {.nosideEffect, rtl, extern: "npegs$1Capture".} = @@ -1006,14 +1337,18 @@ proc replace*(s: string, sub: Peg, cb: proc( inc(m) add(result, substr(s, i)) -proc transformFile*(infile, outfile: string, - subs: varargs[tuple[pattern: Peg, repl: string]]) {. - rtl, extern: "npegs$1".} = - ## reads in the file `infile`, performs a parallel replacement (calls - ## `parallelReplace`) and writes back to `outfile`. Raises ``EIO`` if an - ## error occurs. This is supposed to be used for quick scripting. - var x = readFile(infile).string - writeFile(outfile, x.parallelReplace(subs)) +when not defined(js): + proc transformFile*(infile, outfile: string, + subs: varargs[tuple[pattern: Peg, repl: string]]) {. + rtl, extern: "npegs$1".} = + ## reads in the file `infile`, performs a parallel replacement (calls + ## `parallelReplace`) and writes back to `outfile`. Raises ``EIO`` if an + ## error occurs. This is supposed to be used for quick scripting. + ## + ## **Note**: this proc does not exist while using the JS backend. + var x = readFile(infile).string + writeFile(outfile, x.parallelReplace(subs)) + iterator split*(s: string, sep: Peg): string = ## Splits the string `s` into substrings. @@ -1117,7 +1452,7 @@ proc handleCR(L: var PegLexer, pos: int): int = assert(L.buf[pos] == '\c') inc(L.lineNumber) result = pos+1 - if L.buf[result] == '\L': inc(result) + if result < L.buf.len and L.buf[result] == '\L': inc(result) L.lineStart = result proc handleLF(L: var PegLexer, pos: int): int = @@ -1213,12 +1548,13 @@ proc getEscapedChar(c: var PegLexer, tok: var Token) = proc skip(c: var PegLexer) = var pos = c.bufpos var buf = c.buf - while true: + while pos < c.buf.len: case buf[pos] of ' ', '\t': inc(pos) of '#': - while not (buf[pos] in {'\c', '\L', '\0'}): inc(pos) + while (pos < c.buf.len) and + not (buf[pos] in {'\c', '\L', '\0'}): inc(pos) of '\c': pos = handleCR(c, pos) buf = c.buf @@ -1234,7 +1570,7 @@ proc getString(c: var PegLexer, tok: var Token) = var pos = c.bufpos + 1 var buf = c.buf var quote = buf[pos-1] - while true: + while pos < c.buf.len: case buf[pos] of '\\': c.bufpos = pos @@ -1257,7 +1593,7 @@ proc getDollar(c: var PegLexer, tok: var Token) = if buf[pos] in {'0'..'9'}: tok.kind = tkBackref tok.index = 0 - while buf[pos] in {'0'..'9'}: + while pos < c.buf.len and buf[pos] in {'0'..'9'}: tok.index = tok.index * 10 + ord(buf[pos]) - ord('0') inc(pos) else: @@ -1273,11 +1609,11 @@ proc getCharSet(c: var PegLexer, tok: var Token) = if buf[pos] == '^': inc(pos) caret = true - while true: + while pos < c.buf.len: var ch: char case buf[pos] of ']': - inc(pos) + if pos < c.buf.len: inc(pos) break of '\\': c.bufpos = pos @@ -1292,11 +1628,14 @@ proc getCharSet(c: var PegLexer, tok: var Token) = inc(pos) incl(tok.charset, ch) if buf[pos] == '-': - if buf[pos+1] == ']': + if pos+1 < c.buf.len and buf[pos+1] == ']': incl(tok.charset, '-') inc(pos) else: - inc(pos) + if pos+1 < c.buf.len: + inc(pos) + else: + break var ch2: char case buf[pos] of '\\': @@ -1308,8 +1647,11 @@ proc getCharSet(c: var PegLexer, tok: var Token) = tok.kind = tkInvalid break else: - ch2 = buf[pos] - inc(pos) + if pos+1 < c.buf.len: + ch2 = buf[pos] + inc(pos) + else: + break for i in ord(ch)+1 .. ord(ch2): incl(tok.charset, chr(i)) c.bufpos = pos @@ -1318,15 +1660,15 @@ proc getCharSet(c: var PegLexer, tok: var Token) = proc getSymbol(c: var PegLexer, tok: var Token) = var pos = c.bufpos var buf = c.buf - while true: + while pos < c.buf.len: add(tok.literal, buf[pos]) inc(pos) - if buf[pos] notin strutils.IdentChars: break + if pos < buf.len and buf[pos] notin strutils.IdentChars: break c.bufpos = pos tok.kind = tkIdentifier proc getBuiltin(c: var PegLexer, tok: var Token) = - if c.buf[c.bufpos+1] in strutils.Letters: + if c.bufpos+1 < c.buf.len and c.buf[c.bufpos+1] in strutils.Letters: inc(c.bufpos) getSymbol(c, tok) tok.kind = tkBuiltin @@ -1339,10 +1681,12 @@ proc getTok(c: var PegLexer, tok: var Token) = tok.modifier = modNone setLen(tok.literal, 0) skip(c) + case c.buf[c.bufpos] of '{': inc(c.bufpos) - if c.buf[c.bufpos] == '@' and c.buf[c.bufpos+1] == '}': + if c.buf[c.bufpos] == '@' and c.bufpos+2 < c.buf.len and + c.buf[c.bufpos+1] == '}': tok.kind = tkCurlyAt inc(c.bufpos, 2) add(tok.literal, "{@}") @@ -1375,13 +1719,11 @@ proc getTok(c: var PegLexer, tok: var Token) = getBuiltin(c, tok) of '\'', '"': getString(c, tok) of '$': getDollar(c, tok) - of '\0': - tok.kind = tkEof - tok.literal = "[EOF]" of 'a'..'z', 'A'..'Z', '\128'..'\255': getSymbol(c, tok) if c.buf[c.bufpos] in {'\'', '"'} or - c.buf[c.bufpos] == '$' and c.buf[c.bufpos+1] in {'0'..'9'}: + c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and + c.buf[c.bufpos+1] in {'0'..'9'}: case tok.literal of "i": tok.modifier = modIgnoreCase of "y": tok.modifier = modIgnoreStyle @@ -1402,7 +1744,7 @@ proc getTok(c: var PegLexer, tok: var Token) = inc(c.bufpos) add(tok.literal, '+') of '<': - if c.buf[c.bufpos+1] == '-': + if c.bufpos+2 < c.buf.len and c.buf[c.bufpos+1] == '-': inc(c.bufpos, 2) tok.kind = tkArrow add(tok.literal, "<-") @@ -1437,14 +1779,17 @@ proc getTok(c: var PegLexer, tok: var Token) = inc(c.bufpos) add(tok.literal, '^') else: + if c.bufpos >= c.buf.len: + tok.kind = tkEof + tok.literal = "[EOF]" add(tok.literal, c.buf[c.bufpos]) inc(c.bufpos) proc arrowIsNextTok(c: PegLexer): bool = # the only look ahead we need var pos = c.bufpos - while c.buf[pos] in {'\t', ' '}: inc(pos) - result = c.buf[pos] == '<' and c.buf[pos+1] == '-' + while pos < c.buf.len and c.buf[pos] in {'\t', ' '}: inc(pos) + result = c.buf[pos] == '<' and (pos+1 < c.buf.len) and c.buf[pos+1] == '-' # ----------------------------- parser ---------------------------------------- @@ -1467,7 +1812,7 @@ proc pegError(p: PegParser, msg: string, line = -1, col = -1) = proc getTok(p: var PegParser) = getTok(p, p.tok) - if p.tok.kind == tkInvalid: pegError(p, "invalid token") + if p.tok.kind == tkInvalid: pegError(p, "'" & p.tok.literal & "' is invalid token") proc eat(p: var PegParser, kind: TokKind) = if p.tok.kind == kind: getTok(p) @@ -1817,7 +2162,7 @@ when isMainModule: assert match("prefix/start", peg"^start$", 7) if "foo" =~ peg"{'a'}?.*": - assert matches[0] == nil + assert matches[0].len == 0 else: assert false if "foo" =~ peg"{''}.*": |