diff options
Diffstat (limited to 'lib/pure/pegs.nim')
-rw-r--r--[-rwxr-xr-x] | lib/pure/pegs.nim | 2145 |
1 files changed, 1230 insertions, 915 deletions
diff --git a/lib/pure/pegs.nim b/lib/pure/pegs.nim index 4e31ffc0c..2969fd6d7 100755..100644 --- a/lib/pure/pegs.nim +++ b/lib/pure/pegs.nim @@ -1,6 +1,6 @@ # # -# Nimrod's Runtime Library +# Nim's Runtime Library # (c) Copyright 2012 Andreas Rumpf # # See the file "copying.txt", included in this @@ -12,134 +12,183 @@ ## Matching performance is hopefully competitive with optimized regular ## expression engines. ## -## .. include:: ../doc/pegdocs.txt +## .. include:: ../../doc/pegdocs.txt ## include "system/inclrtl" +when defined(nimPreviewSlimSystem): + import std/[syncio, assertions] const useUnicode = true ## change this to deactivate proper UTF-8 support -import - strutils +import std/[strutils, macros] +import std/private/decode_helpers when useUnicode: - import unicode + import std/unicode + export unicode.`==` const InlineThreshold = 5 ## number of leaves; -1 to disable inlining - MaxSubpatterns* = 10 ## defines the maximum number of subpatterns that - ## can be captured. More subpatterns cannot be captured! + MaxSubpatterns* = 20 ## defines the maximum number of subpatterns that + ## can be captured. More subpatterns cannot be captured! type - TPegKind = enum + PegKind* = enum pkEmpty, - pkAny, ## any character (.) - pkAnyRune, ## any Unicode character (_) - pkNewLine, ## CR-LF, LF, CR - pkLetter, ## Unicode letter - pkLower, ## Unicode lower case letter - pkUpper, ## Unicode upper case letter - pkTitle, ## Unicode title character - pkWhitespace, ## Unicode whitespace character + pkAny, ## any character (.) + pkAnyRune, ## any Unicode character (_) + pkNewLine, ## CR-LF, LF, CR + pkLetter, ## Unicode letter + pkLower, ## Unicode lower case letter + pkUpper, ## Unicode upper case letter + pkTitle, ## Unicode title character + pkWhitespace, ## Unicode whitespace character pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, - pkChar, ## single character to match + pkChar, ## single character to match pkCharChoice, pkNonTerminal, - pkSequence, ## a b c ... --> Internal DSL: peg(a, b, c) - pkOrderedChoice, ## a / b / ... --> Internal DSL: a / b or /[a, b, c] - pkGreedyRep, ## a* --> Internal DSL: *a - ## a+ --> (a a*) - pkGreedyRepChar, ## x* where x is a single character (superop) - pkGreedyRepSet, ## [set]* (superop) - pkGreedyAny, ## .* or _* (superop) - pkOption, ## a? --> Internal DSL: ?a - pkAndPredicate, ## &a --> Internal DSL: &a - pkNotPredicate, ## !a --> Internal DSL: !a - pkCapture, ## {a} --> Internal DSL: capture(a) - pkBackRef, ## $i --> Internal DSL: backref(i) + pkSequence, ## a b c ... --> Internal DSL: peg(a, b, c) + pkOrderedChoice, ## a / b / ... --> Internal DSL: a / b or /[a, b, c] + pkGreedyRep, ## a* --> Internal DSL: *a + ## a+ --> (a a*) + pkGreedyRepChar, ## x* where x is a single character (superop) + pkGreedyRepSet, ## [set]* (superop) + pkGreedyAny, ## .* or _* (superop) + pkOption, ## a? --> Internal DSL: ?a + pkAndPredicate, ## &a --> Internal DSL: &a + pkNotPredicate, ## !a --> Internal DSL: !a + pkCapture, ## {a} --> Internal DSL: capture(a) + pkBackRef, ## $i --> Internal DSL: backref(i) pkBackRefIgnoreCase, pkBackRefIgnoreStyle, - pkSearch, ## @a --> Internal DSL: !*a - pkCapturedSearch, ## {@} a --> Internal DSL: !*\a - pkRule, ## a <- b - pkList, ## a, b - pkStartAnchor ## ^ --> Internal DSL: startAnchor() - TNonTerminalFlag = enum + pkSearch, ## @a --> Internal DSL: !*a + pkCapturedSearch, ## {@} a --> Internal DSL: !*\a + pkRule, ## a <- b + pkList, ## a, b + pkStartAnchor ## ^ --> Internal DSL: startAnchor() + NonTerminalFlag* = enum ntDeclared, ntUsed - TNonTerminal {.final.} = object ## represents a non terminal symbol - name: string ## the name of the symbol - line: int ## line the symbol has been declared/used in - col: int ## column the symbol has been declared/used in - flags: set[TNonTerminalFlag] ## the nonterminal's flags - rule: TNode ## the rule that the symbol refers to - TNode {.final, shallow.} = object - case kind: TPegKind + NonTerminalObj = object ## represents a non terminal symbol + name: string ## the name of the symbol + line: int ## line the symbol has been declared/used in + col: int ## column the symbol has been declared/used in + flags: set[NonTerminalFlag] ## the nonterminal's flags + rule: Peg ## the rule that the symbol refers to + Peg* {.shallow.} = object ## type that represents a PEG + case kind: PegKind of pkEmpty..pkWhitespace: nil of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle: term: string of pkChar, pkGreedyRepChar: ch: char of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char] - of pkNonTerminal: nt: PNonTerminal - of pkBackRef..pkBackRefIgnoreStyle: index: range[1..MaxSubpatterns] - else: sons: seq[TNode] - PNonTerminal* = ref TNonTerminal - - TPeg* = TNode ## type that represents a PEG - -proc term*(t: string): TPeg {.nosideEffect, rtl, extern: "npegs$1Str".} = + of pkNonTerminal: nt: NonTerminal + of pkBackRef..pkBackRefIgnoreStyle: index: range[-MaxSubpatterns..MaxSubpatterns-1] + else: sons: seq[Peg] + NonTerminal* = ref NonTerminalObj + +func kind*(p: Peg): PegKind = p.kind + ## Returns the *PegKind* of a given *Peg* object. + +func term*(p: Peg): string = p.term + ## Returns the *string* representation of a given *Peg* variant object + ## where present. + +func ch*(p: Peg): char = p.ch + ## Returns the *char* representation of a given *Peg* variant object + ## where present. + +func charChoice*(p: Peg): ref set[char] = p.charChoice + ## Returns the *charChoice* field of a given *Peg* variant object + ## where present. + +func nt*(p: Peg): NonTerminal = p.nt + ## Returns the *NonTerminal* object of a given *Peg* variant object + ## where present. + +func index*(p: Peg): range[-MaxSubpatterns..MaxSubpatterns-1] = p.index + ## Returns the back-reference index of a captured sub-pattern in the + ## *Captures* object for a given *Peg* variant object where present. + +iterator items*(p: Peg): Peg {.inline.} = + ## Yields the child nodes of a *Peg* variant object where present. + for s in p.sons: + yield s + +iterator pairs*(p: Peg): (int, Peg) {.inline.} = + ## Yields the indices and child nodes of a *Peg* variant object where present. + for i in 0 ..< p.sons.len: + yield (i, p.sons[i]) + +func name*(nt: NonTerminal): string = nt.name + ## Gets the name of the symbol represented by the parent *Peg* object variant + ## of a given *NonTerminal*. + +func line*(nt: NonTerminal): int = nt.line + ## Gets the line number of the definition of the parent *Peg* object variant + ## of a given *NonTerminal*. + +func col*(nt: NonTerminal): int = nt.col + ## Gets the column number of the definition of the parent *Peg* object variant + ## of a given *NonTerminal*. + +func flags*(nt: NonTerminal): set[NonTerminalFlag] = nt.flags + ## Gets the *NonTerminalFlag*-typed flags field of the parent *Peg* variant + ## object of a given *NonTerminal*. + +func rule*(nt: NonTerminal): Peg = nt.rule + ## Gets the *Peg* object representing the rule definition of the parent *Peg* + ## object variant of a given *NonTerminal*. + +func term*(t: string): Peg {.rtl, extern: "npegs$1Str".} = ## constructs a PEG from a terminal string - if t.len != 1: - result.kind = pkTerminal - result.term = t + if t.len != 1: + result = Peg(kind: pkTerminal, term: t) else: - result.kind = pkChar - result.ch = t[0] + result = Peg(kind: pkChar, ch: t[0]) -proc termIgnoreCase*(t: string): TPeg {. - nosideEffect, rtl, extern: "npegs$1".} = +func termIgnoreCase*(t: string): Peg {. + rtl, extern: "npegs$1".} = ## constructs a PEG from a terminal string; ignore case for matching - result.kind = pkTerminalIgnoreCase - result.term = t + result = Peg(kind: pkTerminalIgnoreCase, term: t) -proc termIgnoreStyle*(t: string): TPeg {. - nosideEffect, rtl, extern: "npegs$1".} = +func termIgnoreStyle*(t: string): Peg {. + rtl, extern: "npegs$1".} = ## constructs a PEG from a terminal string; ignore style for matching - result.kind = pkTerminalIgnoreStyle - result.term = t + result = Peg(kind: pkTerminalIgnoreStyle, term: t) -proc term*(t: char): TPeg {.nosideEffect, rtl, extern: "npegs$1Char".} = +func term*(t: char): Peg {.rtl, extern: "npegs$1Char".} = ## constructs a PEG from a terminal char assert t != '\0' - result.kind = pkChar - result.ch = t - -proc charSet*(s: set[char]): TPeg {.nosideEffect, rtl, extern: "npegs$1".} = + result = Peg(kind: pkChar, ch: t) + +func charSet*(s: set[char]): Peg {.rtl, extern: "npegs$1".} = ## constructs a PEG from a character set `s` assert '\0' notin s - result.kind = pkCharChoice - new(result.charChoice) - result.charChoice[] = s + result = Peg(kind: pkCharChoice) + {.cast(noSideEffect).}: + new(result.charChoice) + result.charChoice[] = s -proc len(a: TPeg): int {.inline.} = return a.sons.len -proc add(d: var TPeg, s: TPeg) {.inline.} = add(d.sons, s) +func len(a: Peg): int {.inline.} = return a.sons.len +func add(d: var Peg, s: Peg) {.inline.} = add(d.sons, s) -proc addChoice(dest: var TPeg, elem: TPeg) = +func addChoice(dest: var Peg, elem: Peg) = var L = dest.len-1 - if L >= 0 and dest.sons[L].kind == pkCharChoice: + if L >= 0 and dest.sons[L].kind == pkCharChoice: # caution! Do not introduce false aliasing here! case elem.kind of pkCharChoice: dest.sons[L] = charSet(dest.sons[L].charChoice[] + elem.charChoice[]) - of pkChar: + of pkChar: dest.sons[L] = charSet(dest.sons[L].charChoice[] + {elem.ch}) else: add(dest, elem) else: add(dest, elem) -template multipleOp(k: TPegKind, localOpt: expr) = - result.kind = k - result.sons = @[] +template multipleOp(k: PegKind, localOpt: untyped) = + result = Peg(kind: k, sons: @[]) for x in items(a): if x.kind == k: for y in items(x.sons): @@ -149,29 +198,29 @@ template multipleOp(k: TPegKind, localOpt: expr) = if result.len == 1: result = result.sons[0] -proc `/`*(a: varargs[TPeg]): TPeg {. - nosideEffect, rtl, extern: "npegsOrderedChoice".} = +func `/`*(a: varargs[Peg]): Peg {. + rtl, extern: "npegsOrderedChoice".} = ## constructs an ordered choice with the PEGs in `a` multipleOp(pkOrderedChoice, addChoice) -proc addSequence(dest: var TPeg, elem: TPeg) = +func addSequence(dest: var Peg, elem: Peg) = var L = dest.len-1 - if L >= 0 and dest.sons[L].kind == pkTerminal: + if L >= 0 and dest.sons[L].kind == pkTerminal: # caution! Do not introduce false aliasing here! case elem.kind - of pkTerminal: + of pkTerminal: dest.sons[L] = term(dest.sons[L].term & elem.term) - of pkChar: + of pkChar: dest.sons[L] = term(dest.sons[L].term & elem.ch) else: add(dest, elem) else: add(dest, elem) -proc sequence*(a: varargs[TPeg]): TPeg {. - nosideEffect, rtl, extern: "npegs$1".} = +func sequence*(a: varargs[Peg]): Peg {. + rtl, extern: "npegs$1".} = ## constructs a sequence with all the PEGs from `a` multipleOp(pkSequence, addSequence) - -proc `?`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsOptional".} = + +func `?`*(a: Peg): Peg {.rtl, extern: "npegsOptional".} = ## constructs an optional for the PEG `a` if a.kind in {pkOption, pkGreedyRep, pkGreedyAny, pkGreedyRepChar, pkGreedyRepSet}: @@ -179,137 +228,116 @@ proc `?`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsOptional".} = # a? ? --> a? result = a else: - result.kind = pkOption - result.sons = @[a] + result = Peg(kind: pkOption, sons: @[a]) -proc `*`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsGreedyRep".} = +func `*`*(a: Peg): Peg {.rtl, extern: "npegsGreedyRep".} = ## constructs a "greedy repetition" for the PEG `a` case a.kind of pkGreedyRep, pkGreedyRepChar, pkGreedyRepSet, pkGreedyAny, pkOption: assert false # produces endless loop! of pkChar: - result.kind = pkGreedyRepChar - result.ch = a.ch + result = Peg(kind: pkGreedyRepChar, ch: a.ch) of pkCharChoice: - result.kind = pkGreedyRepSet - result.charChoice = a.charChoice # copying a reference suffices! + result = Peg(kind: pkGreedyRepSet, charChoice: a.charChoice) of pkAny, pkAnyRune: - result.kind = pkGreedyAny + result = Peg(kind: pkGreedyAny) else: - result.kind = pkGreedyRep - result.sons = @[a] + result = Peg(kind: pkGreedyRep, sons: @[a]) -proc `!*`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsSearch".} = +func `!*`*(a: Peg): Peg {.rtl, extern: "npegsSearch".} = ## constructs a "search" for the PEG `a` - result.kind = pkSearch - result.sons = @[a] + result = Peg(kind: pkSearch, sons: @[a]) -proc `!*\`*(a: TPeg): TPeg {.noSideEffect, rtl, +func `!*\`*(a: Peg): Peg {.rtl, extern: "npgegsCapturedSearch".} = ## constructs a "captured search" for the PEG `a` - result.kind = pkCapturedSearch - result.sons = @[a] - -when false: - proc contains(a: TPeg, k: TPegKind): bool = - if a.kind == k: return true - case a.kind - of pkEmpty, pkAny, pkAnyRune, pkGreedyAny, pkNewLine, pkTerminal, - pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar, pkGreedyRepChar, - pkCharChoice, pkGreedyRepSet: nil - of pkNonTerminal: return true - else: - for i in 0..a.sons.len-1: - if contains(a.sons[i], k): return true + result = Peg(kind: pkCapturedSearch, sons: @[a]) -proc `+`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsGreedyPosRep".} = +func `+`*(a: Peg): Peg {.rtl, extern: "npegsGreedyPosRep".} = ## constructs a "greedy positive repetition" with the PEG `a` return sequence(a, *a) - -proc `&`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsAndPredicate".} = + +func `&`*(a: Peg): Peg {.rtl, extern: "npegsAndPredicate".} = ## constructs an "and predicate" with the PEG `a` - result.kind = pkAndPredicate - result.sons = @[a] + result = Peg(kind: pkAndPredicate, sons: @[a]) -proc `!`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsNotPredicate".} = +func `!`*(a: Peg): Peg {.rtl, extern: "npegsNotPredicate".} = ## constructs a "not predicate" with the PEG `a` - result.kind = pkNotPredicate - result.sons = @[a] + result = Peg(kind: pkNotPredicate, sons: @[a]) -proc any*: TPeg {.inline.} = +func any*: Peg {.inline.} = ## constructs the PEG `any character`:idx: (``.``) - result.kind = pkAny + result = Peg(kind: pkAny) -proc anyRune*: TPeg {.inline.} = +func anyRune*: Peg {.inline.} = ## constructs the PEG `any rune`:idx: (``_``) - result.kind = pkAnyRune + result = Peg(kind: pkAnyRune) -proc newLine*: TPeg {.inline.} = +func newLine*: Peg {.inline.} = ## constructs the PEG `newline`:idx: (``\n``) - result.kind = pkNewline + result = Peg(kind: pkNewLine) -proc UnicodeLetter*: TPeg {.inline.} = +func unicodeLetter*: Peg {.inline.} = ## constructs the PEG ``\letter`` which matches any Unicode letter. - result.kind = pkLetter - -proc UnicodeLower*: TPeg {.inline.} = + result = Peg(kind: pkLetter) + +func unicodeLower*: Peg {.inline.} = ## constructs the PEG ``\lower`` which matches any Unicode lowercase letter. - result.kind = pkLower + result = Peg(kind: pkLower) -proc UnicodeUpper*: TPeg {.inline.} = +func unicodeUpper*: Peg {.inline.} = ## constructs the PEG ``\upper`` which matches any Unicode uppercase letter. - result.kind = pkUpper - -proc UnicodeTitle*: TPeg {.inline.} = + result = Peg(kind: pkUpper) + +func unicodeTitle*: Peg {.inline.} = ## constructs the PEG ``\title`` which matches any Unicode title letter. - result.kind = pkTitle + result = Peg(kind: pkTitle) -proc UnicodeWhitespace*: TPeg {.inline.} = - ## constructs the PEG ``\white`` which matches any Unicode +func unicodeWhitespace*: Peg {.inline.} = + ## constructs the PEG ``\white`` which matches any Unicode ## whitespace character. - result.kind = pkWhitespace + result = Peg(kind: pkWhitespace) -proc startAnchor*: TPeg {.inline.} = - ## constructs the PEG ``^`` which matches the start of the input. - result.kind = pkStartAnchor +func startAnchor*: Peg {.inline.} = + ## constructs the PEG ``^`` which matches the start of the input. + result = Peg(kind: pkStartAnchor) -proc endAnchor*: TPeg {.inline.} = - ## constructs the PEG ``$`` which matches the end of the input. +func endAnchor*: Peg {.inline.} = + ## constructs the PEG ``$`` which matches the end of the input. result = !any() -proc capture*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsCapture".} = +func capture*(a: Peg = Peg(kind: pkEmpty)): Peg {.rtl, extern: "npegsCapture".} = ## constructs a capture with the PEG `a` - result.kind = pkCapture - result.sons = @[a] + result = Peg(kind: pkCapture, sons: @[a]) -proc backref*(index: range[1..MaxSubPatterns]): TPeg {. - nosideEffect, rtl, extern: "npegs$1".} = +func backref*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {. + rtl, extern: "npegs$1".} = ## constructs a back reference of the given `index`. `index` starts counting - ## from 1. - result.kind = pkBackRef - result.index = index-1 + ## from 1. `reverse` specifies whether indexing starts from the end of the + ## capture list. + result = Peg(kind: pkBackRef, index: (if reverse: -index else: index - 1)) -proc backrefIgnoreCase*(index: range[1..MaxSubPatterns]): TPeg {. - nosideEffect, rtl, extern: "npegs$1".} = +func backrefIgnoreCase*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {. + rtl, extern: "npegs$1".} = ## constructs a back reference of the given `index`. `index` starts counting - ## from 1. Ignores case for matching. - result.kind = pkBackRefIgnoreCase - result.index = index-1 + ## from 1. `reverse` specifies whether indexing starts from the end of the + ## capture list. Ignores case for matching. + result = Peg(kind: pkBackRefIgnoreCase, index: (if reverse: -index else: index - 1)) -proc backrefIgnoreStyle*(index: range[1..MaxSubPatterns]): TPeg {. - nosideEffect, rtl, extern: "npegs$1".}= +func backrefIgnoreStyle*(index: range[1..MaxSubpatterns], reverse: bool = false): Peg {. + rtl, extern: "npegs$1".} = ## constructs a back reference of the given `index`. `index` starts counting - ## from 1. Ignores style for matching. - result.kind = pkBackRefIgnoreStyle - result.index = index-1 + ## from 1. `reverse` specifies whether indexing starts from the end of the + ## capture list. Ignores style for matching. + result = Peg(kind: pkBackRefIgnoreStyle, index: (if reverse: -index else: index - 1)) -proc spaceCost(n: TPeg): int = +func spaceCost(n: Peg): int = case n.kind - of pkEmpty: nil + of pkEmpty: discard of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar, - pkGreedyRepChar, pkCharChoice, pkGreedyRepSet, - pkAny..pkWhitespace, pkGreedyAny: + pkGreedyRepChar, pkCharChoice, pkGreedyRepSet, + pkAny..pkWhitespace, pkGreedyAny, pkBackRef..pkBackRefIgnoreStyle: result = 1 of pkNonTerminal: # we cannot inline a rule with a non-terminal @@ -319,57 +347,53 @@ proc spaceCost(n: TPeg): int = inc(result, spaceCost(n.sons[i])) if result >= InlineThreshold: break -proc nonterminal*(n: PNonTerminal): TPeg {. - nosideEffect, rtl, extern: "npegs$1".} = +func nonterminal*(n: NonTerminal): Peg {. + rtl, extern: "npegs$1".} = ## constructs a PEG that consists of the nonterminal symbol assert n != nil if ntDeclared in n.flags and spaceCost(n.rule) < InlineThreshold: when false: echo "inlining symbol: ", n.name result = n.rule # inlining of rule enables better optimizations else: - result.kind = pkNonTerminal - result.nt = n + result = Peg(kind: pkNonTerminal, nt: n) -proc newNonTerminal*(name: string, line, column: int): PNonTerminal {. - nosideEffect, rtl, extern: "npegs$1".} = +func newNonTerminal*(name: string, line, column: int): NonTerminal {. + rtl, extern: "npegs$1".} = ## constructs a nonterminal symbol - new(result) - result.name = name - result.line = line - result.col = column + result = NonTerminal(name: name, line: line, col: column) -template letters*: expr = +template letters*: Peg = ## expands to ``charset({'A'..'Z', 'a'..'z'})`` - charset({'A'..'Z', 'a'..'z'}) - -template digits*: expr = + charSet({'A'..'Z', 'a'..'z'}) + +template digits*: Peg = ## expands to ``charset({'0'..'9'})`` - charset({'0'..'9'}) + charSet({'0'..'9'}) -template whitespace*: expr = +template whitespace*: Peg = ## expands to ``charset({' ', '\9'..'\13'})`` - charset({' ', '\9'..'\13'}) - -template identChars*: expr = + charSet({' ', '\9'..'\13'}) + +template identChars*: Peg = ## expands to ``charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})`` - charset({'a'..'z', 'A'..'Z', '0'..'9', '_'}) - -template identStartChars*: expr = + charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'}) + +template identStartChars*: Peg = ## expands to ``charset({'A'..'Z', 'a'..'z', '_'})`` - charset({'a'..'z', 'A'..'Z', '_'}) + charSet({'a'..'z', 'A'..'Z', '_'}) -template ident*: expr = +template ident*: Peg = ## same as ``[a-zA-Z_][a-zA-z_0-9]*``; standard identifier - sequence(charset({'a'..'z', 'A'..'Z', '_'}), - *charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})) - -template natural*: expr = + sequence(charSet({'a'..'z', 'A'..'Z', '_'}), + *charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'})) + +template natural*: Peg = ## same as ``\d+`` +digits # ------------------------- debugging ----------------------------------------- -proc esc(c: char, reserved = {'\0'..'\255'}): string = +func esc(c: char, reserved = {'\0'..'\255'}): string = case c of '\b': result = "\\b" of '\t': result = "\\t" @@ -381,41 +405,41 @@ proc esc(c: char, reserved = {'\0'..'\255'}): string = of '\a': result = "\\a" of '\\': result = "\\\\" of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c - elif c < ' ' or c >= '\128': result = '\\' & $ord(c) + elif c < ' ' or c >= '\127': result = '\\' & $ord(c) elif c in reserved: result = '\\' & c else: result = $c - -proc singleQuoteEsc(c: Char): string = return "'" & esc(c, {'\''}) & "'" -proc singleQuoteEsc(str: string): string = +func singleQuoteEsc(c: char): string = return "'" & esc(c, {'\''}) & "'" + +func singleQuoteEsc(str: string): string = result = "'" for c in items(str): add result, esc(c, {'\''}) add result, '\'' - -proc charSetEscAux(cc: set[char]): string = + +func charSetEscAux(cc: set[char]): string = const reserved = {'^', '-', ']'} result = "" var c1 = 0 - while c1 <= 0xff: - if chr(c1) in cc: + while c1 <= 0xff: + if chr(c1) in cc: var c2 = c1 while c2 < 0xff and chr(succ(c2)) in cc: inc(c2) - if c1 == c2: + if c1 == c2: add result, esc(chr(c1), reserved) - elif c2 == succ(c1): + elif c2 == succ(c1): add result, esc(chr(c1), reserved) & esc(chr(c2), reserved) - else: + else: add result, esc(chr(c1), reserved) & '-' & esc(chr(c2), reserved) c1 = c2 inc(c1) - -proc CharSetEsc(cc: set[char]): string = - if card(cc) >= 128+64: - result = "[^" & CharSetEscAux({'\1'..'\xFF'} - cc) & ']' - else: - result = '[' & CharSetEscAux(cc) & ']' - -proc toStrAux(r: TPeg, res: var string) = + +func charSetEsc(cc: set[char]): string = + if card(cc) >= 128+64: + result = "[^" & charSetEscAux({'\1'..'\xFF'} - cc) & ']' + else: + result = '[' & charSetEscAux(cc) & ']' + +func toStrAux(r: Peg, res: var string) = case r.kind of pkEmpty: add(res, "()") of pkAny: add(res, '.') @@ -426,7 +450,7 @@ proc toStrAux(r: TPeg, res: var string) = of pkTitle: add(res, "\\title") of pkWhitespace: add(res, "\\white") - of pkNewline: add(res, "\\n") + of pkNewLine: add(res, "\\n") of pkTerminal: add(res, singleQuoteEsc(r.term)) of pkTerminalIgnoreCase: add(res, 'i') @@ -479,29 +503,29 @@ proc toStrAux(r: TPeg, res: var string) = toStrAux(r.sons[0], res) of pkCapture: add(res, '{') - toStrAux(r.sons[0], res) + toStrAux(r.sons[0], res) add(res, '}') - of pkBackRef: + of pkBackRef: add(res, '$') add(res, $r.index) - of pkBackRefIgnoreCase: + of pkBackRefIgnoreCase: add(res, "i$") add(res, $r.index) - of pkBackRefIgnoreStyle: + of pkBackRefIgnoreStyle: add(res, "y$") add(res, $r.index) of pkRule: - toStrAux(r.sons[0], res) + toStrAux(r.sons[0], res) add(res, " <- ") toStrAux(r.sons[1], res) of pkList: for i in 0 .. high(r.sons): toStrAux(r.sons[i], res) - add(res, "\n") + add(res, "\n") of pkStartAnchor: add(res, '^') -proc `$` *(r: TPeg): string {.nosideEffect, rtl, extern: "npegsToString".} = +func `$` *(r: Peg): string {.rtl, extern: "npegsToString".} = ## converts a PEG to its string representation result = "" toStrAux(r, result) @@ -509,289 +533,595 @@ proc `$` *(r: TPeg): string {.nosideEffect, rtl, extern: "npegsToString".} = # --------------------- core engine ------------------------------------------- type - TCaptures* {.final.} = object ## contains the captured substrings. - matches: array[0..maxSubpatterns-1, tuple[first, last: int]] + Captures* = object ## contains the captured substrings. + matches: array[0..MaxSubpatterns-1, tuple[first, last: int]] ml: int origStart: int -proc bounds*(c: TCaptures, - i: range[0..maxSubpatterns-1]): tuple[first, last: int] = +func bounds*(c: Captures, + i: range[0..MaxSubpatterns-1]): tuple[first, last: int] = ## returns the bounds ``[first..last]`` of the `i`'th capture. result = c.matches[i] when not useUnicode: type - TRune = char - template fastRuneAt(s, i, ch: expr) = + Rune = char + template fastRuneAt(s, i, ch) = ch = s[i] inc(i) - template runeLenAt(s, i: expr): expr = 1 - - proc isAlpha(a: char): bool {.inline.} = return a in {'a'..'z','A'..'Z'} - proc isUpper(a: char): bool {.inline.} = return a in {'A'..'Z'} - proc isLower(a: char): bool {.inline.} = return a in {'a'..'z'} - proc isTitle(a: char): bool {.inline.} = return false - proc isWhiteSpace(a: char): bool {.inline.} = return a in {' ', '\9'..'\13'} - -proc rawMatch*(s: string, p: TPeg, start: int, c: var TCaptures): int {. - nosideEffect, rtl, extern: "npegs$1".} = - ## low-level matching proc that implements the PEG interpreter. Use this - ## for maximum efficiency (every other PEG operation ends up calling this - ## proc). - ## Returns -1 if it does not match, else the length of the match - case p.kind - of pkEmpty: result = 0 # match of length 0 - of pkAny: - if s[start] != '\0': result = 1 - else: result = -1 - of pkAnyRune: - if s[start] != '\0': - result = runeLenAt(s, start) - else: - result = -1 - of pkLetter: - if s[start] != '\0': - var a: TRune - result = start - fastRuneAt(s, result, a) - if isAlpha(a): dec(result, start) + template runeLenAt(s, i): untyped = 1 + + func isAlpha(a: char): bool {.inline.} = return a in {'a'..'z', 'A'..'Z'} + func isUpper(a: char): bool {.inline.} = return a in {'A'..'Z'} + func isLower(a: char): bool {.inline.} = return a in {'a'..'z'} + func isTitle(a: char): bool {.inline.} = return false + func isWhiteSpace(a: char): bool {.inline.} = return a in {' ', '\9'..'\13'} + +template matchOrParse(mopProc: untyped) = + # Used to make the main matcher proc *rawMatch* as well as event parser + # procs. For the former, *enter* and *leave* event handler code generators + # are provided which just return *discard*. + + proc mopProc(s: string, p: Peg, start: int, c: var Captures): int {.gcsafe, raises: [].} = + proc matchBackRef(s: string, p: Peg, start: int, c: var Captures): int = + # Parse handler code must run in an *of* clause of its own for each + # *PegKind*, so we encapsulate the identical clause body for + # *pkBackRef..pkBackRefIgnoreStyle* here. + var index = p.index + if index < 0: index.inc(c.ml) + if index < 0 or index >= c.ml: return -1 + var (a, b) = c.matches[index] + var n: Peg + case p.kind + of pkBackRef: + n = Peg(kind: pkTerminal, term: s.substr(a, b)) + of pkBackRefIgnoreStyle: + n = Peg(kind: pkTerminalIgnoreStyle, term: s.substr(a, b)) + of pkBackRefIgnoreCase: + n = Peg(kind: pkTerminalIgnoreCase, term: s.substr(a, b)) + else: assert(false, "impossible case") + mopProc(s, n, start, c) + + case p.kind + of pkEmpty: + enter(pkEmpty, s, p, start) + result = 0 # match of length 0 + leave(pkEmpty, s, p, start, result) + of pkAny: + enter(pkAny, s, p, start) + if start < s.len: result = 1 else: result = -1 - else: - result = -1 - of pkLower: - if s[start] != '\0': - var a: TRune - result = start - fastRuneAt(s, result, a) - if isLower(a): dec(result, start) + leave(pkAny, s, p, start, result) + of pkAnyRune: + enter(pkAnyRune, s, p, start) + if start < s.len: + result = runeLenAt(s, start) + else: + result = -1 + leave(pkAnyRune, s, p, start, result) + of pkLetter: + enter(pkLetter, s, p, start) + if start < s.len: + var a: Rune + result = start + fastRuneAt(s, result, a) + if isAlpha(a): dec(result, start) + else: result = -1 + else: + result = -1 + leave(pkLetter, s, p, start, result) + of pkLower: + enter(pkLower, s, p, start) + if start < s.len: + var a: Rune + result = start + fastRuneAt(s, result, a) + if isLower(a): dec(result, start) + else: result = -1 + else: + result = -1 + leave(pkLower, s, p, start, result) + of pkUpper: + enter(pkUpper, s, p, start) + if start < s.len: + var a: Rune + result = start + fastRuneAt(s, result, a) + if isUpper(a): dec(result, start) + else: result = -1 + else: + result = -1 + leave(pkUpper, s, p, start, result) + of pkTitle: + enter(pkTitle, s, p, start) + if start < s.len: + var a: Rune + result = start + fastRuneAt(s, result, a) + if isTitle(a): dec(result, start) + else: result = -1 + else: + result = -1 + leave(pkTitle, s, p, start, result) + of pkWhitespace: + enter(pkWhitespace, s, p, start) + if start < s.len: + var a: Rune + result = start + fastRuneAt(s, result, a) + if isWhiteSpace(a): dec(result, start) + else: result = -1 + else: + result = -1 + leave(pkWhitespace, s, p, start, result) + of pkGreedyAny: + enter(pkGreedyAny, s, p, start) + result = len(s) - start + leave(pkGreedyAny, s, p, start, result) + of pkNewLine: + enter(pkNewLine, s, p, start) + if start < s.len and s[start] == '\L': result = 1 + elif start < s.len and s[start] == '\C': + if start+1 < s.len and s[start+1] == '\L': result = 2 + else: result = 1 else: result = -1 - else: - result = -1 - of pkUpper: - if s[start] != '\0': - var a: TRune + leave(pkNewLine, s, p, start, result) + of pkTerminal: + enter(pkTerminal, s, p, start) + result = len(p.term) + for i in 0..result-1: + if start+i >= s.len or p.term[i] != s[start+i]: + result = -1 + break + leave(pkTerminal, s, p, start, result) + of pkTerminalIgnoreCase: + enter(pkTerminalIgnoreCase, s, p, start) + var + i = 0 + a, b: Rune result = start - fastRuneAt(s, result, a) - if isUpper(a): dec(result, start) - else: result = -1 - else: - result = -1 - of pkTitle: - if s[start] != '\0': - var a: TRune + while i < len(p.term): + if result >= s.len: + result = -1 + break + fastRuneAt(p.term, i, a) + fastRuneAt(s, result, b) + if toLower(a) != toLower(b): + result = -1 + break + dec(result, start) + leave(pkTerminalIgnoreCase, s, p, start, result) + of pkTerminalIgnoreStyle: + enter(pkTerminalIgnoreStyle, s, p, start) + var + i = 0 + a, b: Rune result = start - fastRuneAt(s, result, a) - if isTitle(a): dec(result, start) + while i < len(p.term): + while i < len(p.term): + fastRuneAt(p.term, i, a) + if a != Rune('_'): break + while result < s.len: + fastRuneAt(s, result, b) + if b != Rune('_'): break + if result >= s.len: + if i >= p.term.len: break + else: + result = -1 + break + elif toLower(a) != toLower(b): + result = -1 + break + dec(result, start) + leave(pkTerminalIgnoreStyle, s, p, start, result) + of pkChar: + enter(pkChar, s, p, start) + if start < s.len and p.ch == s[start]: result = 1 else: result = -1 - else: - result = -1 - of pkWhitespace: - if s[start] != '\0': - var a: TRune - result = start - fastRuneAt(s, result, a) - if isWhitespace(a): dec(result, start) + leave(pkChar, s, p, start, result) + of pkCharChoice: + enter(pkCharChoice, s, p, start) + if start < s.len and contains(p.charChoice[], s[start]): result = 1 else: result = -1 - else: - result = -1 - of pkGreedyAny: - result = len(s) - start - of pkNewLine: - if s[start] == '\L': result = 1 - elif s[start] == '\C': - if s[start+1] == '\L': result = 2 - else: result = 1 - else: result = -1 - of pkTerminal: - result = len(p.term) - for i in 0..result-1: - if p.term[i] != s[start+i]: - result = -1 - break - of pkTerminalIgnoreCase: - var - i = 0 - a, b: TRune - result = start - while i < len(p.term): - fastRuneAt(p.term, i, a) - fastRuneAt(s, result, b) - if toLower(a) != toLower(b): - result = -1 - break - dec(result, start) - of pkTerminalIgnoreStyle: - var - i = 0 - a, b: TRune - result = start - while i < len(p.term): - while true: - fastRuneAt(p.term, i, a) - if a != TRune('_'): break - while true: - fastRuneAt(s, result, b) - if b != TRune('_'): break - if toLower(a) != toLower(b): - result = -1 - break - dec(result, start) - of pkChar: - if p.ch == s[start]: result = 1 - else: result = -1 - of pkCharChoice: - if contains(p.charChoice[], s[start]): result = 1 - else: result = -1 - of pkNonTerminal: - var oldMl = c.ml - when false: echo "enter: ", p.nt.name - result = rawMatch(s, p.nt.rule, start, c) - when false: echo "leave: ", p.nt.name - if result < 0: c.ml = oldMl - of pkSequence: - var oldMl = c.ml - result = 0 - for i in 0..high(p.sons): - var x = rawMatch(s, p.sons[i], start+result, c) - if x < 0: + leave(pkCharChoice, s, p, start, result) + of pkNonTerminal: + enter(pkNonTerminal, s, p, start) + var oldMl = c.ml + when false: echo "enter: ", p.nt.name + result = mopProc(s, p.nt.rule, start, c) + when false: echo "leave: ", p.nt.name + if result < 0: c.ml = oldMl + leave(pkNonTerminal, s, p, start, result) + of pkSequence: + enter(pkSequence, s, p, start) + var oldMl = c.ml + result = 0 + for i in 0..high(p.sons): + var x = mopProc(s, p.sons[i], start+result, c) + if x < 0: + c.ml = oldMl + result = -1 + break + else: inc(result, x) + leave(pkSequence, s, p, start, result) + of pkOrderedChoice: + enter(pkOrderedChoice, s, p, start) + var oldMl = c.ml + for i in 0..high(p.sons): + result = mopProc(s, p.sons[i], start, c) + if result >= 0: break c.ml = oldMl - result = -1 - break - else: inc(result, x) - of pkOrderedChoice: - var oldMl = c.ml - for i in 0..high(p.sons): - result = rawMatch(s, p.sons[i], start, c) - if result >= 0: break - c.ml = oldMl - of pkSearch: - var oldMl = c.ml - result = 0 - while start+result < s.len: - var x = rawMatch(s, p.sons[0], start+result, c) - if x >= 0: - inc(result, x) - return - inc(result) - result = -1 - c.ml = oldMl - of pkCapturedSearch: - var idx = c.ml # reserve a slot for the subpattern - inc(c.ml) - result = 0 - while start+result < s.len: - var x = rawMatch(s, p.sons[0], start+result, c) - if x >= 0: - if idx < maxSubpatterns: - c.matches[idx] = (start, start+result-1) - #else: silently ignore the capture - inc(result, x) - return - inc(result) - result = -1 - c.ml = idx - of pkGreedyRep: - result = 0 - while true: - var x = rawMatch(s, p.sons[0], start+result, c) - # if x == 0, we have an endless loop; so the correct behaviour would be - # not to break. But endless loops can be easily introduced: - # ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the - # expected thing in this case. - if x <= 0: break - inc(result, x) - of pkGreedyRepChar: - result = 0 - var ch = p.ch - while ch == s[start+result]: inc(result) - of pkGreedyRepSet: - result = 0 - while contains(p.charChoice[], s[start+result]): inc(result) - of pkOption: - result = max(0, rawMatch(s, p.sons[0], start, c)) - of pkAndPredicate: - var oldMl = c.ml - result = rawMatch(s, p.sons[0], start, c) - if result >= 0: result = 0 # do not consume anything - else: c.ml = oldMl - of pkNotPredicate: - var oldMl = c.ml - result = rawMatch(s, p.sons[0], start, c) - if result < 0: result = 0 - else: + leave(pkOrderedChoice, s, p, start, result) + of pkSearch: + enter(pkSearch, s, p, start) + var oldMl = c.ml + result = 0 + while start+result <= s.len: + var x = mopProc(s, p.sons[0], start+result, c) + if x >= 0: + inc(result, x) + leave(pkSearch, s, p, start, result) + return + inc(result) + result = -1 c.ml = oldMl + leave(pkSearch, s, p, start, result) + of pkCapturedSearch: + enter(pkCapturedSearch, s, p, start) + var idx = c.ml # reserve a slot for the subpattern + inc(c.ml) + result = 0 + while start+result <= s.len: + var x = mopProc(s, p.sons[0], start+result, c) + if x >= 0: + if idx < MaxSubpatterns: + c.matches[idx] = (start, start+result-1) + #else: silently ignore the capture + inc(result, x) + leave(pkCapturedSearch, s, p, start, result) + return + inc(result) result = -1 - of pkCapture: - var idx = c.ml # reserve a slot for the subpattern - inc(c.ml) - result = rawMatch(s, p.sons[0], start, c) - if result >= 0: - if idx < maxSubpatterns: - c.matches[idx] = (start, start+result-1) - #else: silently ignore the capture - else: c.ml = idx - of pkBackRef..pkBackRefIgnoreStyle: - if p.index >= c.ml: return -1 - var (a, b) = c.matches[p.index] - var n: TPeg - n.kind = succ(pkTerminal, ord(p.kind)-ord(pkBackRef)) - n.term = s.substr(a, b) - result = rawMatch(s, n, start, c) - of pkStartAnchor: - if c.origStart == start: result = 0 - else: result = -1 - of pkRule, pkList: assert false + leave(pkCapturedSearch, s, p, start, result) + of pkGreedyRep: + enter(pkGreedyRep, s, p, start) + result = 0 + while true: + var x = mopProc(s, p.sons[0], start+result, c) + # if x == 0, we have an endless loop; so the correct behaviour would be + # not to break. But endless loops can be easily introduced: + # ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the + # expected thing in this case. + if x <= 0: break + inc(result, x) + leave(pkGreedyRep, s, p, start, result) + of pkGreedyRepChar: + enter(pkGreedyRepChar, s, p, start) + result = 0 + var ch = p.ch + while start+result < s.len and ch == s[start+result]: inc(result) + leave(pkGreedyRepChar, s, p, start, result) + of pkGreedyRepSet: + enter(pkGreedyRepSet, s, p, start) + result = 0 + while start+result < s.len and contains(p.charChoice[], s[start+result]): + inc(result) + leave(pkGreedyRepSet, s, p, start, result) + of pkOption: + enter(pkOption, s, p, start) + result = max(0, mopProc(s, p.sons[0], start, c)) + leave(pkOption, s, p, start, result) + of pkAndPredicate: + enter(pkAndPredicate, s, p, start) + var oldMl = c.ml + result = mopProc(s, p.sons[0], start, c) + if result >= 0: result = 0 # do not consume anything + else: c.ml = oldMl + leave(pkAndPredicate, s, p, start, result) + of pkNotPredicate: + enter(pkNotPredicate, s, p, start) + var oldMl = c.ml + result = mopProc(s, p.sons[0], start, c) + if result < 0: result = 0 + else: + c.ml = oldMl + result = -1 + leave(pkNotPredicate, s, p, start, result) + of pkCapture: + enter(pkCapture, s, p, start) + if p.sons.len == 0 or p.sons[0].kind == pkEmpty: + # empty capture removes last match + dec(c.ml) + c.matches[c.ml] = (0, 0) + result = 0 # match of length 0 + else: + var idx = c.ml # reserve a slot for the subpattern + result = mopProc(s, p.sons[0], start, c) + if result >= 0: + if idx < MaxSubpatterns: + if idx != c.ml: + for i in countdown(c.ml, idx): + c.matches[i+1] = c.matches[i] + c.matches[idx] = (start, start+result-1) + #else: silently ignore the capture + inc(c.ml) + leave(pkCapture, s, p, start, result) + of pkBackRef: + enter(pkBackRef, s, p, start) + result = matchBackRef(s, p, start, c) + leave(pkBackRef, s, p, start, result) + of pkBackRefIgnoreCase: + enter(pkBackRefIgnoreCase, s, p, start) + result = matchBackRef(s, p, start, c) + leave(pkBackRefIgnoreCase, s, p, start, result) + of pkBackRefIgnoreStyle: + enter(pkBackRefIgnoreStyle, s, p, start) + result = matchBackRef(s, p, start, c) + leave(pkBackRefIgnoreStyle, s, p, start, result) + of pkStartAnchor: + enter(pkStartAnchor, s, p, start) + if c.origStart == start: result = 0 + else: result = -1 + leave(pkStartAnchor, s, p, start, result) + of pkRule, pkList: assert false -template fillMatches(s, caps, c: expr) = - for k in 0..c.ml-1: - caps[k] = substr(s, c.matches[k][0], c.matches[k][1]) +func rawMatch*(s: string, p: Peg, start: int, c: var Captures): int + {.rtl, extern: "npegs$1".} = + ## low-level matching proc that implements the PEG interpreter. Use this + ## for maximum efficiency (every other PEG operation ends up calling this + ## proc). + ## Returns -1 if it does not match, else the length of the match -proc match*(s: string, pattern: TPeg, matches: var openarray[string], - start = 0): bool {.nosideEffect, rtl, extern: "npegs$1Capture".} = - ## returns ``true`` if ``s[start..]`` matches the ``pattern`` and - ## the captured substrings in the array ``matches``. If it does not - ## match, nothing is written into ``matches`` and ``false`` is - ## returned. - var c: TCaptures - c.origStart = start - result = rawMatch(s, pattern, start, c) == len(s) - start - if result: fillMatches(s, matches, c) + # Set the handler generators to produce do-nothing handlers. + template enter(pk, s, p, start) = + discard + template leave(pk, s, p, start, length) = + discard + matchOrParse(matchIt) + {.cast(noSideEffect).}: + # This cast is allowed because the `matchOrParse` template is used for + # both matching and parsing, but side effects are only possible when it's + # used by `eventParser`. + result = matchIt(s, p, start, c) + +macro mkHandlerTplts(handlers: untyped): untyped = + # Transforms the handler spec in *handlers* into handler templates. + # The AST structure of *handlers[0]*: + # + # ``` + # StmtList + # Call + # Ident "pkNonTerminal" + # StmtList + # Call + # Ident "enter" + # StmtList + # <handler code block> + # Call + # Ident "leave" + # StmtList + # <handler code block> + # Call + # Ident "pkChar" + # StmtList + # Call + # Ident "leave" + # StmtList + # <handler code block> + # ... + # ``` + func mkEnter(hdName, body: NimNode): NimNode = + template helper(hdName, body) {.dirty.} = + template hdName(s, p, start) = + let s {.inject.} = s + let p {.inject.} = p + let start {.inject.} = start + body + result = getAst(helper(hdName, body)) + + template mkLeave(hdPostf, body) {.dirty.} = + # this has to be dirty to be able to capture *result* as *length* in + # *leaveXX* calls. + template `leave hdPostf`(s, p, start, length) = + body + + result = newStmtList() + for topCall in handlers[0]: + if topCall.kind notin nnkCallKinds: + error("Call syntax expected.", topCall) + let pegKind = topCall[0] + if pegKind.kind notin {nnkIdent, nnkSym}: + error("PegKind expected.", pegKind) + if 2 == topCall.len: + for hdDef in topCall[1]: + if hdDef.kind notin nnkCallKinds: + error("Call syntax expected.", hdDef) + if hdDef[0].kind notin {nnkIdent, nnkSym}: + error("Handler identifier expected.", hdDef[0]) + if 2 == hdDef.len: + let hdPostf = substr(pegKind.strVal, 2) + case hdDef[0].strVal + of "enter": + result.add mkEnter(newIdentNode("enter" & hdPostf), hdDef[1]) + of "leave": + result.add getAst(mkLeave(ident(hdPostf), hdDef[1])) + else: + error( + "Unsupported handler identifier, expected 'enter' or 'leave'.", + hdDef[0] + ) + +template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) = + ## Generates an interpreting event parser *proc* according to the specified + ## PEG AST and handler code blocks. The *proc* can be called with a string + ## to be parsed and will execute the handler code blocks whenever their + ## associated grammar element is matched. It returns -1 if the string does not + ## match, else the length of the total match. The following example code + ## evaluates an arithmetic expression defined by a simple PEG: + ## + ## ```nim + ## import std/[strutils, pegs] + ## + ## let + ## pegAst = """ + ## Expr <- Sum + ## Sum <- Product (('+' / '-')Product)* + ## Product <- Value (('*' / '/')Value)* + ## Value <- [0-9]+ / '(' Expr ')' + ## """.peg + ## txt = "(5+3)/2-7*22" + ## + ## var + ## pStack: seq[string] = @[] + ## valStack: seq[float] = @[] + ## opStack = "" + ## let + ## parseArithExpr = pegAst.eventParser: + ## pkNonTerminal: + ## enter: + ## pStack.add p.nt.name + ## leave: + ## pStack.setLen pStack.high + ## if length > 0: + ## let matchStr = s.substr(start, start+length-1) + ## case p.nt.name + ## of "Value": + ## try: + ## valStack.add matchStr.parseFloat + ## echo valStack + ## except ValueError: + ## discard + ## of "Sum", "Product": + ## try: + ## let val = matchStr.parseFloat + ## except ValueError: + ## if valStack.len > 1 and opStack.len > 0: + ## valStack[^2] = case opStack[^1] + ## of '+': valStack[^2] + valStack[^1] + ## of '-': valStack[^2] - valStack[^1] + ## of '*': valStack[^2] * valStack[^1] + ## else: valStack[^2] / valStack[^1] + ## valStack.setLen valStack.high + ## echo valStack + ## opStack.setLen opStack.high + ## echo opStack + ## pkChar: + ## leave: + ## if length == 1 and "Value" != pStack[^1]: + ## let matchChar = s[start] + ## opStack.add matchChar + ## echo opStack + ## + ## let pLen = parseArithExpr(txt) + ## ``` + ## + ## The *handlers* parameter consists of code blocks for *PegKinds*, + ## which define the grammar elements of interest. Each block can contain + ## handler code to be executed when the parser enters and leaves text + ## matching the grammar element. An *enter* handler can access the specific + ## PEG AST node being matched as *p*, the entire parsed string as *s* + ## and the position of the matched text segment in *s* as *start*. A *leave* + ## handler can access *p*, *s*, *start* and also the length of the matched + ## text segment as *length*. For an unsuccessful match, the *enter* and + ## *leave* handlers will be executed, with *length* set to -1. + ## + ## Symbols declared in an *enter* handler can be made visible in the + ## corresponding *leave* handler by annotating them with an *inject* pragma. + proc rawParse(s: string, p: Peg, start: int, c: var Captures): int + {.gensym.} = + + # binding from *macros* + bind strVal + + mkHandlerTplts: + handlers + + macro enter(pegKind, s, pegNode, start: untyped): untyped = + # This is called by the matcher code in *matchOrParse* at the + # start of the code for a grammar element of kind *pegKind*. + # Expands to a call to the handler template if one was generated + # by *mkHandlerTplts*. + template mkDoEnter(hdPostf, s, pegNode, start) = + when declared(`enter hdPostf`): + `enter hdPostf`(s, pegNode, start) + else: + discard + let hdPostf = ident(substr($pegKind, 2)) + getAst(mkDoEnter(hdPostf, s, pegNode, start)) + + macro leave(pegKind, s, pegNode, start, length: untyped): untyped = + # Like *enter*, but called at the end of the matcher code for + # a grammar element of kind *pegKind*. + template mkDoLeave(hdPostf, s, pegNode, start, length) = + when declared(`leave hdPostf`): + `leave hdPostf`(s, pegNode, start, length) + else: + discard + let hdPostf = ident(substr($pegKind, 2)) + getAst(mkDoLeave(hdPostf, s, pegNode, start, length)) + + matchOrParse(parseIt) + parseIt(s, p, start, c) + + proc parser(s: string): int {.gensym.} = + # the proc to be returned + var + ms: array[MaxSubpatterns, (int, int)] + cs = Captures(matches: ms, ml: 0, origStart: 0) + rawParse(s, pegAst, 0, cs) + parser -proc match*(s: string, pattern: TPeg, - start = 0): bool {.nosideEffect, rtl, extern: "npegs$1".} = - ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``. - var c: TCaptures - c.origStart = start - result = rawMatch(s, pattern, start, c) == len(s)-start +template fillMatches(s, caps, c) = + for k in 0..c.ml-1: + let startIdx = c.matches[k][0] + let endIdx = c.matches[k][1] + if startIdx != -1: + caps[k] = substr(s, startIdx, endIdx) + else: + caps[k] = "" -proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string], - start = 0): int {.nosideEffect, rtl, extern: "npegs$1Capture".} = +func matchLen*(s: string, pattern: Peg, matches: var openArray[string], + start = 0): int {.rtl, extern: "npegs$1Capture".} = ## the same as ``match``, but it returns the length of the match, ## if there is no match, -1 is returned. Note that a match length ## of zero can happen. It's possible that a suffix of `s` remains ## that does not belong to the match. - var c: TCaptures + var c: Captures c.origStart = start result = rawMatch(s, pattern, start, c) if result >= 0: fillMatches(s, matches, c) -proc matchLen*(s: string, pattern: TPeg, - start = 0): int {.nosideEffect, rtl, extern: "npegs$1".} = +func matchLen*(s: string, pattern: Peg, + start = 0): int {.rtl, extern: "npegs$1".} = ## the same as ``match``, but it returns the length of the match, ## if there is no match, -1 is returned. Note that a match length ## of zero can happen. It's possible that a suffix of `s` remains ## that does not belong to the match. - var c: TCaptures + var c: Captures c.origStart = start result = rawMatch(s, pattern, start, c) -proc find*(s: string, pattern: TPeg, matches: var openarray[string], - start = 0): int {.nosideEffect, rtl, extern: "npegs$1Capture".} = +func match*(s: string, pattern: Peg, matches: var openArray[string], + start = 0): bool {.rtl, extern: "npegs$1Capture".} = + ## returns ``true`` if ``s[start..]`` matches the ``pattern`` and + ## the captured substrings in the array ``matches``. If it does not + ## match, nothing is written into ``matches`` and ``false`` is + ## returned. + result = matchLen(s, pattern, matches, start) != -1 + +func match*(s: string, pattern: Peg, + start = 0): bool {.rtl, extern: "npegs$1".} = + ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``. + result = matchLen(s, pattern, start) != -1 + + +func find*(s: string, pattern: Peg, matches: var openArray[string], + start = 0): int {.rtl, extern: "npegs$1Capture".} = ## returns the starting position of ``pattern`` in ``s`` and the captured ## substrings in the array ``matches``. If it does not match, nothing ## is written into ``matches`` and -1 is returned. - var c: TCaptures + var c: Captures c.origStart = start for i in start .. s.len-1: c.ml = 0 @@ -800,15 +1130,15 @@ proc find*(s: string, pattern: TPeg, matches: var openarray[string], return i return -1 # could also use the pattern here: (!P .)* P - -proc findBounds*(s: string, pattern: TPeg, matches: var openarray[string], + +func findBounds*(s: string, pattern: Peg, matches: var openArray[string], start = 0): tuple[first, last: int] {. - nosideEffect, rtl, extern: "npegs$1Capture".} = - ## returns the starting position and end position of ``pattern`` in ``s`` + rtl, extern: "npegs$1Capture".} = + ## returns the starting position and end position of ``pattern`` in ``s`` ## and the captured ## substrings in the array ``matches``. If it does not match, nothing ## is written into ``matches`` and (-1,0) is returned. - var c: TCaptures + var c: Captures c.origStart = start for i in start .. s.len-1: c.ml = 0 @@ -817,45 +1147,44 @@ proc findBounds*(s: string, pattern: TPeg, matches: var openarray[string], fillMatches(s, matches, c) return (i, i+L-1) return (-1, 0) - -proc find*(s: string, pattern: TPeg, - start = 0): int {.nosideEffect, rtl, extern: "npegs$1".} = + +func find*(s: string, pattern: Peg, + start = 0): int {.rtl, extern: "npegs$1".} = ## returns the starting position of ``pattern`` in ``s``. If it does not ## match, -1 is returned. - var c: TCaptures + var c: Captures c.origStart = start for i in start .. s.len-1: if rawMatch(s, pattern, i, c) >= 0: return i return -1 - -iterator findAll*(s: string, pattern: TPeg, start = 0): string = + +iterator findAll*(s: string, pattern: Peg, start = 0): string = ## yields all matching *substrings* of `s` that match `pattern`. - var c: TCaptures + var c: Captures c.origStart = start var i = start while i < s.len: c.ml = 0 var L = rawMatch(s, pattern, i, c) - if L < 0: break - yield substr(s, i, i+L-1) - inc(i, L) - -proc findAll*(s: string, pattern: TPeg, start = 0): seq[string] {. - nosideEffect, rtl, extern: "npegs$1".} = + if L < 0: + inc(i, 1) + else: + yield substr(s, i, i+L-1) + inc(i, L) + +func findAll*(s: string, pattern: Peg, start = 0): seq[string] {. + rtl, extern: "npegs$1".} = ## returns all matching *substrings* of `s` that match `pattern`. - ## If it does not match, @[] is returned. - accumulateResult(findAll(s, pattern, start)) - -when not defined(nimhygiene): - {.pragma: inject.} - -template `=~`*(s: string, pattern: TPeg): bool = - ## This calls ``match`` with an implicit declared ``matches`` array that - ## can be used in the scope of the ``=~`` call: - ## - ## .. code-block:: nimrod + ## If it does not match, `@[]` is returned. + result = @[] + for it in findAll(s, pattern, start): result.add it + +template `=~`*(s: string, pattern: Peg): bool = + ## This calls ``match`` with an implicit declared ``matches`` array that + ## can be used in the scope of the ``=~`` call: ## - ## if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}": + ## ```nim + ## if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}": ## # matches a key=value pair: ## echo("Key: ", matches[0]) ## echo("Value: ", matches[1]) @@ -866,53 +1195,55 @@ template `=~`*(s: string, pattern: TPeg): bool = ## echo("comment: ", matches[0]) ## else: ## echo("syntax error") - ## - when not definedInScope(matches): - var matches {.inject.}: array[0..pegs.maxSubpatterns-1, string] + ## ``` + bind MaxSubpatterns + when not declaredInScope(matches): + var matches {.inject.} = default(array[0..MaxSubpatterns-1, string]) match(s, pattern, matches) # ------------------------- more string handling ------------------------------ -proc contains*(s: string, pattern: TPeg, start = 0): bool {. - nosideEffect, rtl, extern: "npegs$1".} = +func contains*(s: string, pattern: Peg, start = 0): bool {. + rtl, extern: "npegs$1".} = ## same as ``find(s, pattern, start) >= 0`` return find(s, pattern, start) >= 0 -proc contains*(s: string, pattern: TPeg, matches: var openArray[string], - start = 0): bool {.nosideEffect, rtl, extern: "npegs$1Capture".} = +func contains*(s: string, pattern: Peg, matches: var openArray[string], + start = 0): bool {.rtl, extern: "npegs$1Capture".} = ## same as ``find(s, pattern, matches, start) >= 0`` return find(s, pattern, matches, start) >= 0 -proc startsWith*(s: string, prefix: TPeg, start = 0): bool {. - nosideEffect, rtl, extern: "npegs$1".} = +func startsWith*(s: string, prefix: Peg, start = 0): bool {. + rtl, extern: "npegs$1".} = ## returns true if `s` starts with the pattern `prefix` result = matchLen(s, prefix, start) >= 0 -proc endsWith*(s: string, suffix: TPeg, start = 0): bool {. - nosideEffect, rtl, extern: "npegs$1".} = - ## returns true if `s` ends with the pattern `prefix` - var c: TCaptures +func endsWith*(s: string, suffix: Peg, start = 0): bool {. + rtl, extern: "npegs$1".} = + ## returns true if `s` ends with the pattern `suffix` + var c: Captures c.origStart = start for i in start .. s.len-1: if rawMatch(s, suffix, i, c) == s.len - i: return true -proc replacef*(s: string, sub: TPeg, by: string): string {. - nosideEffect, rtl, extern: "npegs$1".} = +func replacef*(s: string, sub: Peg, by: string): string {. + rtl, extern: "npegs$1".} = ## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by` ## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples: ## - ## .. code-block:: nimrod - ## "var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2") + ## ```nim + ## "var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2") + ## ``` ## ## Results in: ## - ## .. code-block:: nimrod - ## + ## ```nim ## "var1<-keykey; val2<-key2key2" + ## ``` result = "" var i = 0 - var caps: array[0..maxSubpatterns-1, string] - var c: TCaptures + var caps: array[0..MaxSubpatterns-1, string] + var c: Captures while i < s.len: c.ml = 0 var x = rawMatch(s, sub, i, c) @@ -925,13 +1256,13 @@ proc replacef*(s: string, sub: TPeg, by: string): string {. inc(i, x) add(result, substr(s, i)) -proc replace*(s: string, sub: TPeg, by = ""): string {. - nosideEffect, rtl, extern: "npegs$1".} = +func replace*(s: string, sub: Peg, by = ""): string {. + rtl, extern: "npegs$1".} = ## Replaces `sub` in `s` by the string `by`. Captures cannot be accessed ## in `by`. result = "" var i = 0 - var c: TCaptures + var c: Captures while i < s.len: var x = rawMatch(s, sub, i, c) if x <= 0: @@ -941,16 +1272,16 @@ proc replace*(s: string, sub: TPeg, by = ""): string {. add(result, by) inc(i, x) add(result, substr(s, i)) - -proc parallelReplace*(s: string, subs: varargs[ - tuple[pattern: TPeg, repl: string]]): string {. - nosideEffect, rtl, extern: "npegs$1".} = + +func parallelReplace*(s: string, subs: varargs[ + tuple[pattern: Peg, repl: string]]): string {. + rtl, extern: "npegs$1".} = ## Returns a modified copy of `s` with the substitutions in `subs` ## applied in parallel. result = "" var i = 0 - var c: TCaptures - var caps: array[0..maxSubpatterns-1, string] + var c: Captures + var caps: array[0..MaxSubpatterns-1, string] while i < s.len: block searchSubs: for j in 0..high(subs): @@ -964,36 +1295,88 @@ proc parallelReplace*(s: string, subs: varargs[ add(result, s[i]) inc(i) # copy the rest: - add(result, substr(s, i)) - -proc transformFile*(infile, outfile: string, - subs: varargs[tuple[pattern: TPeg, repl: string]]) {. - rtl, extern: "npegs$1".} = - ## reads in the file `infile`, performs a parallel replacement (calls - ## `parallelReplace`) and writes back to `outfile`. Raises ``EIO`` if an - ## error occurs. This is supposed to be used for quick scripting. - var x = readFile(infile).string - writeFile(outfile, x.parallelReplace(subs)) - -iterator split*(s: string, sep: TPeg): string = + add(result, substr(s, i)) + +when not defined(nimHasEffectsOf): + {.pragma: effectsOf.} + +func replace*(s: string, sub: Peg, cb: proc( + match: int, cnt: int, caps: openArray[string]): string): string {. + rtl, extern: "npegs$1cb", effectsOf: cb.} = + ## Replaces `sub` in `s` by the resulting strings from the callback. + ## The callback proc receives the index of the current match (starting with 0), + ## the count of captures and an open array with the captures of each match. Examples: + ## + ## ```nim + ## func handleMatches*(m: int, n: int, c: openArray[string]): string = + ## result = "" + ## if m > 0: + ## result.add ", " + ## result.add case n: + ## of 2: c[0].toLower & ": '" & c[1] & "'" + ## of 1: c[0].toLower & ": ''" + ## else: "" + ## + ## let s = "Var1=key1;var2=Key2; VAR3" + ## echo s.replace(peg"{\ident}('='{\ident})* ';'* \s*", handleMatches) + ## ``` + ## + ## Results in: + ## + ## ```nim + ## "var1: 'key1', var2: 'Key2', var3: ''" + ## ``` + result = "" + var i = 0 + var caps: array[0..MaxSubpatterns-1, string] + var c: Captures + var m = 0 + while i < s.len: + c.ml = 0 + var x = rawMatch(s, sub, i, c) + if x <= 0: + add(result, s[i]) + inc(i) + else: + fillMatches(s, caps, c) + add(result, cb(m, c.ml, caps)) + inc(i, x) + inc(m) + add(result, substr(s, i)) + +when not defined(js): + proc transformFile*(infile, outfile: string, + subs: varargs[tuple[pattern: Peg, repl: string]]) {. + rtl, extern: "npegs$1".} = + ## reads in the file `infile`, performs a parallel replacement (calls + ## `parallelReplace`) and writes back to `outfile`. Raises ``IOError`` if an + ## error occurs. This is supposed to be used for quick scripting. + ## + ## **Note**: this proc does not exist while using the JS backend. + var x = readFile(infile) + writeFile(outfile, x.parallelReplace(subs)) + + +iterator split*(s: string, sep: Peg): string = ## Splits the string `s` into substrings. ## ## Substrings are separated by the PEG `sep`. ## Examples: ## - ## .. code-block:: nimrod + ## ```nim ## for word in split("00232this02939is39an22example111", peg"\d+"): - ## writeln(stdout, word) + ## writeLine(stdout, word) + ## ``` ## ## Results in: ## - ## .. code-block:: nimrod + ## ```nim ## "this" ## "is" ## "an" ## "example" - ## - var c: TCaptures + ## ``` + var c: Captures var first = 0 last = 0 @@ -1010,83 +1393,85 @@ iterator split*(s: string, sep: TPeg): string = if first < last: yield substr(s, first, last-1) -proc split*(s: string, sep: TPeg): seq[string] {. - nosideEffect, rtl, extern: "npegs$1".} = +func split*(s: string, sep: Peg): seq[string] {. + rtl, extern: "npegs$1".} = ## Splits the string `s` into substrings. - accumulateResult(split(s, sep)) + result = @[] + for it in split(s, sep): result.add it # ------------------- scanner ------------------------------------------------- type - TModifier = enum + Modifier = enum modNone, modVerbatim, modIgnoreCase, modIgnoreStyle - TTokKind = enum ## enumeration of all tokens - tkInvalid, ## invalid token - tkEof, ## end of file reached - tkAny, ## . - tkAnyRune, ## _ - tkIdentifier, ## abc - tkStringLit, ## "abc" or 'abc' - tkCharSet, ## [^A-Z] - tkParLe, ## '(' - tkParRi, ## ')' - tkCurlyLe, ## '{' - tkCurlyRi, ## '}' - tkCurlyAt, ## '{@}' - tkArrow, ## '<-' - tkBar, ## '/' - tkStar, ## '*' - tkPlus, ## '+' - tkAmp, ## '&' - tkNot, ## '!' - tkOption, ## '?' - tkAt, ## '@' - tkBuiltin, ## \identifier - tkEscaped, ## \\ - tkBackref, ## '$' - tkDollar, ## '$' - tkHat ## '^' - - TToken {.final.} = object ## a token - kind: TTokKind ## the type of the token - modifier: TModifier - literal: string ## the parsed (string) literal - charset: set[char] ## if kind == tkCharSet - index: int ## if kind == tkBackref - - TPegLexer {.inheritable.} = object ## the lexer object. - bufpos: int ## the current position within the buffer - buf: cstring ## the buffer itself - LineNumber: int ## the current line number - lineStart: int ## index of last line start in buffer - colOffset: int ## column to add + TokKind = enum ## enumeration of all tokens + tkInvalid, ## invalid token + tkEof, ## end of file reached + tkAny, ## . + tkAnyRune, ## _ + tkIdentifier, ## abc + tkStringLit, ## "abc" or 'abc' + tkCharSet, ## [^A-Z] + tkParLe, ## '(' + tkParRi, ## ')' + tkCurlyLe, ## '{' + tkCurlyRi, ## '}' + tkCurlyAt, ## '{@}' + tkEmptyCurl, ## '{}' + tkArrow, ## '<-' + tkBar, ## '/' + tkStar, ## '*' + tkPlus, ## '+' + tkAmp, ## '&' + tkNot, ## '!' + tkOption, ## '?' + tkAt, ## '@' + tkBuiltin, ## \identifier + tkEscaped, ## \\ + tkBackref, ## '$' + tkDollar, ## '$' + tkHat ## '^' + + Token {.final.} = object ## a token + kind: TokKind ## the type of the token + modifier: Modifier + literal: string ## the parsed (string) literal + charset: set[char] ## if kind == tkCharSet + index: int ## if kind == tkBackref + + PegLexer {.inheritable.} = object ## the lexer object. + bufpos: int ## the current position within the buffer + buf: string ## the buffer itself + lineNumber: int ## the current line number + lineStart: int ## index of last line start in buffer + colOffset: int ## column to add filename: string const - tokKindToStr: array[TTokKind, string] = [ + tokKindToStr: array[TokKind, string] = [ "invalid", "[EOF]", ".", "_", "identifier", "string literal", - "character set", "(", ")", "{", "}", "{@}", + "character set", "(", ")", "{", "}", "{@}", "{}", "<-", "/", "*", "+", "&", "!", "?", "@", "built-in", "escaped", "$", "$", "^" ] -proc HandleCR(L: var TPegLexer, pos: int): int = +func handleCR(L: var PegLexer, pos: int): int = assert(L.buf[pos] == '\c') - inc(L.linenumber) + inc(L.lineNumber) result = pos+1 - if L.buf[result] == '\L': inc(result) + if result < L.buf.len and L.buf[result] == '\L': inc(result) L.lineStart = result -proc HandleLF(L: var TPegLexer, pos: int): int = +func handleLF(L: var PegLexer, pos: int): int = assert(L.buf[pos] == '\L') - inc(L.linenumber) + inc(L.lineNumber) result = pos+1 L.lineStart = result -proc init(L: var TPegLexer, input, filename: string, line = 1, col = 0) = +func init(L: var PegLexer, input, filename: string, line = 1, col = 0) = L.buf = input L.bufpos = 0 L.lineNumber = line @@ -1094,69 +1479,64 @@ proc init(L: var TPegLexer, input, filename: string, line = 1, col = 0) = L.lineStart = 0 L.filename = filename -proc getColumn(L: TPegLexer): int {.inline.} = +func getColumn(L: PegLexer): int {.inline.} = result = abs(L.bufpos - L.lineStart) + L.colOffset -proc getLine(L: TPegLexer): int {.inline.} = - result = L.linenumber - -proc errorStr(L: TPegLexer, msg: string, line = -1, col = -1): string = +func getLine(L: PegLexer): int {.inline.} = + result = L.lineNumber + +func errorStr(L: PegLexer, msg: string, line = -1, col = -1): string = var line = if line < 0: getLine(L) else: line var col = if col < 0: getColumn(L) else: col result = "$1($2, $3) Error: $4" % [L.filename, $line, $col, msg] -proc handleHexChar(c: var TPegLexer, xi: var int) = - case c.buf[c.bufpos] - of '0'..'9': - xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('0')) - inc(c.bufpos) - of 'a'..'f': - xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('a') + 10) - inc(c.bufpos) - of 'A'..'F': - xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('A') + 10) - inc(c.bufpos) - else: nil - -proc getEscapedChar(c: var TPegLexer, tok: var TToken) = +func getEscapedChar(c: var PegLexer, tok: var Token) = inc(c.bufpos) + if c.bufpos >= len(c.buf): + tok.kind = tkInvalid + return case c.buf[c.bufpos] - of 'r', 'R', 'c', 'C': + of 'r', 'R', 'c', 'C': add(tok.literal, '\c') - Inc(c.bufpos) - of 'l', 'L': + inc(c.bufpos) + of 'l', 'L': add(tok.literal, '\L') - Inc(c.bufpos) - of 'f', 'F': + inc(c.bufpos) + of 'f', 'F': add(tok.literal, '\f') inc(c.bufpos) - of 'e', 'E': + of 'e', 'E': add(tok.literal, '\e') - Inc(c.bufpos) - of 'a', 'A': + inc(c.bufpos) + of 'a', 'A': add(tok.literal, '\a') - Inc(c.bufpos) - of 'b', 'B': + inc(c.bufpos) + of 'b', 'B': add(tok.literal, '\b') - Inc(c.bufpos) - of 'v', 'V': + inc(c.bufpos) + of 'v', 'V': add(tok.literal, '\v') - Inc(c.bufpos) - of 't', 'T': + inc(c.bufpos) + of 't', 'T': add(tok.literal, '\t') - Inc(c.bufpos) - of 'x', 'X': inc(c.bufpos) + of 'x', 'X': + inc(c.bufpos) + if c.bufpos >= len(c.buf): + tok.kind = tkInvalid + return var xi = 0 - handleHexChar(c, xi) - handleHexChar(c, xi) + if handleHexChar(c.buf[c.bufpos], xi): + inc(c.bufpos) + if handleHexChar(c.buf[c.bufpos], xi): + inc(c.bufpos) if xi == 0: tok.kind = tkInvalid - else: add(tok.literal, Chr(xi)) - of '0'..'9': + else: add(tok.literal, chr(xi)) + of '0'..'9': var val = ord(c.buf[c.bufpos]) - ord('0') - Inc(c.bufpos) + inc(c.bufpos) var i = 1 - while (i <= 3) and (c.buf[c.bufpos] in {'0'..'9'}): + while (c.bufpos < len(c.buf)) and (i <= 3) and (c.buf[c.bufpos] in {'0'..'9'}): val = val * 10 + ord(c.buf[c.bufpos]) - ord('0') inc(c.bufpos) inc(i) @@ -1164,38 +1544,35 @@ proc getEscapedChar(c: var TPegLexer, tok: var TToken) = else: tok.kind = tkInvalid of '\0'..'\31': tok.kind = tkInvalid - elif c.buf[c.bufpos] in strutils.letters: + elif c.buf[c.bufpos] in strutils.Letters: tok.kind = tkInvalid else: add(tok.literal, c.buf[c.bufpos]) - Inc(c.bufpos) - -proc skip(c: var TPegLexer) = + inc(c.bufpos) + +func skip(c: var PegLexer) = var pos = c.bufpos - var buf = c.buf - while true: - case buf[pos] - of ' ', '\t': - Inc(pos) + while pos < c.buf.len: + case c.buf[pos] + of ' ', '\t': + inc(pos) of '#': - while not (buf[pos] in {'\c', '\L', '\0'}): inc(pos) + while (pos < c.buf.len) and + not (c.buf[pos] in {'\c', '\L', '\0'}): inc(pos) of '\c': - pos = HandleCR(c, pos) - buf = c.buf - of '\L': - pos = HandleLF(c, pos) - buf = c.buf - else: - break # EndOfFile also leaves the loop + pos = handleCR(c, pos) + of '\L': + pos = handleLF(c, pos) + else: + break # EndOfFile also leaves the loop c.bufpos = pos - -proc getString(c: var TPegLexer, tok: var TToken) = + +func getString(c: var PegLexer, tok: var Token) = tok.kind = tkStringLit - var pos = c.bufPos + 1 - var buf = c.buf - var quote = buf[pos-1] - while true: - case buf[pos] + var pos = c.bufpos + 1 + var quote = c.buf[pos-1] + while pos < c.buf.len: + case c.buf[pos] of '\\': c.bufpos = pos getEscapedChar(c, tok) @@ -1203,90 +1580,102 @@ proc getString(c: var TPegLexer, tok: var TToken) = of '\c', '\L', '\0': tok.kind = tkInvalid break - elif buf[pos] == quote: + elif c.buf[pos] == quote: inc(pos) - break + break else: - add(tok.literal, buf[pos]) - Inc(pos) + add(tok.literal, c.buf[pos]) + inc(pos) c.bufpos = pos - -proc getDollar(c: var TPegLexer, tok: var TToken) = - var pos = c.bufPos + 1 - var buf = c.buf - if buf[pos] in {'0'..'9'}: + +func getDollar(c: var PegLexer, tok: var Token) = + var pos = c.bufpos + 1 + var neg = false + if pos < c.buf.len and c.buf[pos] == '^': + neg = true + inc(pos) + if pos < c.buf.len and c.buf[pos] in {'0'..'9'}: tok.kind = tkBackref tok.index = 0 - while buf[pos] in {'0'..'9'}: - tok.index = tok.index * 10 + ord(buf[pos]) - ord('0') + while pos < c.buf.len and c.buf[pos] in {'0'..'9'}: + tok.index = tok.index * 10 + ord(c.buf[pos]) - ord('0') inc(pos) + if neg: + tok.index = -tok.index else: + if neg: + dec(pos) tok.kind = tkDollar c.bufpos = pos - -proc getCharSet(c: var TPegLexer, tok: var TToken) = + +func getCharSet(c: var PegLexer, tok: var Token) = tok.kind = tkCharSet tok.charset = {} - var pos = c.bufPos + 1 - var buf = c.buf + var pos = c.bufpos + 1 var caret = false - if buf[pos] == '^': - inc(pos) - caret = true - while true: - var ch: char - case buf[pos] - of ']': + if pos < c.buf.len: + if c.buf[pos] == '^': inc(pos) - break - of '\\': - c.bufpos = pos - getEscapedChar(c, tok) - pos = c.bufpos - ch = tok.literal[tok.literal.len-1] - of '\C', '\L', '\0': - tok.kind = tkInvalid - break - else: - ch = buf[pos] - Inc(pos) - incl(tok.charset, ch) - if buf[pos] == '-': - if buf[pos+1] == ']': - incl(tok.charset, '-') - inc(pos) + caret = true + while pos < c.buf.len: + var ch: char + case c.buf[pos] + of ']': + if pos < c.buf.len: inc(pos) + break + of '\\': + c.bufpos = pos + getEscapedChar(c, tok) + pos = c.bufpos + ch = tok.literal[tok.literal.len-1] + of '\C', '\L', '\0': + tok.kind = tkInvalid + break else: + ch = c.buf[pos] inc(pos) - var ch2: char - case buf[pos] - of '\\': - c.bufpos = pos - getEscapedChar(c, tok) - pos = c.bufpos - ch2 = tok.literal[tok.literal.len-1] - of '\C', '\L', '\0': - tok.kind = tkInvalid - break - else: - ch2 = buf[pos] - Inc(pos) - for i in ord(ch)+1 .. ord(ch2): - incl(tok.charset, chr(i)) + incl(tok.charset, ch) + if c.buf[pos] == '-': + if pos+1 < c.buf.len and c.buf[pos+1] == ']': + incl(tok.charset, '-') + inc(pos) + else: + if pos+1 < c.buf.len: + inc(pos) + else: + break + var ch2: char + case c.buf[pos] + of '\\': + c.bufpos = pos + getEscapedChar(c, tok) + pos = c.bufpos + ch2 = tok.literal[tok.literal.len-1] + of '\C', '\L', '\0': + tok.kind = tkInvalid + break + else: + if pos+1 < c.buf.len: + ch2 = c.buf[pos] + inc(pos) + else: + break + for i in ord(ch)+1 .. ord(ch2): + incl(tok.charset, chr(i)) c.bufpos = pos if caret: tok.charset = {'\1'..'\xFF'} - tok.charset - -proc getSymbol(c: var TPegLexer, tok: var TToken) = + +func getSymbol(c: var PegLexer, tok: var Token) = var pos = c.bufpos - var buf = c.buf - while true: - add(tok.literal, buf[pos]) - Inc(pos) - if buf[pos] notin strutils.IdentChars: break + while pos < c.buf.len: + add(tok.literal, c.buf[pos]) + inc(pos) + if pos < c.buf.len and c.buf[pos] notin strutils.IdentChars: break c.bufpos = pos tok.kind = tkIdentifier -proc getBuiltin(c: var TPegLexer, tok: var TToken) = - if c.buf[c.bufpos+1] in strutils.Letters: +func getBuiltin(c: var PegLexer, tok: var Token) = + if c.bufpos+1 < c.buf.len and c.buf[c.bufpos+1] in strutils.Letters: inc(c.bufpos) getSymbol(c, tok) tok.kind = tkBuiltin @@ -1294,36 +1683,49 @@ proc getBuiltin(c: var TPegLexer, tok: var TToken) = tok.kind = tkEscaped getEscapedChar(c, tok) # may set tok.kind to tkInvalid -proc getTok(c: var TPegLexer, tok: var TToken) = +func getTok(c: var PegLexer, tok: var Token) = tok.kind = tkInvalid tok.modifier = modNone - setlen(tok.literal, 0) + setLen(tok.literal, 0) skip(c) + + if c.bufpos >= c.buf.len: + tok.kind = tkEof + tok.literal = "[EOF]" + add(tok.literal, '\0') + inc(c.bufpos) + return + case c.buf[c.bufpos] of '{': inc(c.bufpos) - if c.buf[c.bufpos] == '@' and c.buf[c.bufpos+1] == '}': + if c.buf[c.bufpos] == '@' and c.bufpos+2 < c.buf.len and + c.buf[c.bufpos+1] == '}': tok.kind = tkCurlyAt inc(c.bufpos, 2) add(tok.literal, "{@}") + elif c.buf[c.bufpos] == '}' and c.bufpos < c.buf.len: + tok.kind = tkEmptyCurl + inc(c.bufpos) + add(tok.literal, "{}") else: tok.kind = tkCurlyLe add(tok.literal, '{') - of '}': + of '}': tok.kind = tkCurlyRi inc(c.bufpos) add(tok.literal, '}') - of '[': - getCharset(c, tok) + of '[': + getCharSet(c, tok) of '(': tok.kind = tkParLe - Inc(c.bufpos) + inc(c.bufpos) add(tok.literal, '(') of ')': tok.kind = tkParRi - Inc(c.bufpos) + inc(c.bufpos) add(tok.literal, ')') - of '.': + of '.': tok.kind = tkAny inc(c.bufpos) add(tok.literal, '.') @@ -1331,22 +1733,22 @@ proc getTok(c: var TPegLexer, tok: var TToken) = tok.kind = tkAnyRune inc(c.bufpos) add(tok.literal, '_') - of '\\': + of '\\': getBuiltin(c, tok) of '\'', '"': getString(c, tok) of '$': getDollar(c, tok) - of '\0': - tok.kind = tkEof - tok.literal = "[EOF]" of 'a'..'z', 'A'..'Z', '\128'..'\255': getSymbol(c, tok) - if c.buf[c.bufpos] in {'\'', '"'} or - c.buf[c.bufpos] == '$' and c.buf[c.bufpos+1] in {'0'..'9'}: + if c.bufpos >= c.buf.len: + return + if c.buf[c.bufpos] in {'\'', '"'} or + c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and + c.buf[c.bufpos+1] in {'^', '0'..'9'}: case tok.literal of "i": tok.modifier = modIgnoreCase of "y": tok.modifier = modIgnoreStyle of "v": tok.modifier = modVerbatim - else: nil + else: discard setLen(tok.literal, 0) if c.buf[c.bufpos] == '$': getDollar(c, tok) @@ -1362,7 +1764,7 @@ proc getTok(c: var TPegLexer, tok: var TToken) = inc(c.bufpos) add(tok.literal, '+') of '<': - if c.buf[c.bufpos+1] == '-': + if c.bufpos+2 < c.buf.len and c.buf[c.bufpos+1] == '-': inc(c.bufpos, 2) tok.kind = tkArrow add(tok.literal, "<-") @@ -1388,7 +1790,7 @@ proc getTok(c: var TPegLexer, tok: var TToken) = tok.kind = tkAt inc(c.bufpos) add(tok.literal, '@') - if c.buf[c.bufpos] == '@': + if c.buf[c.bufpos] == '@': tok.kind = tkCurlyAt inc(c.bufpos) add(tok.literal, '@') @@ -1397,45 +1799,48 @@ proc getTok(c: var TPegLexer, tok: var TToken) = inc(c.bufpos) add(tok.literal, '^') else: + if c.bufpos >= c.buf.len: + tok.kind = tkEof + tok.literal = "[EOF]" add(tok.literal, c.buf[c.bufpos]) inc(c.bufpos) -proc arrowIsNextTok(c: TPegLexer): bool = +func arrowIsNextTok(c: PegLexer): bool = # the only look ahead we need var pos = c.bufpos - while c.buf[pos] in {'\t', ' '}: inc(pos) + while pos < c.buf.len and c.buf[pos] in {'\t', ' '}: inc(pos) + if pos+1 >= c.buf.len: + return result = c.buf[pos] == '<' and c.buf[pos+1] == '-' # ----------------------------- parser ---------------------------------------- - + type - EInvalidPeg* = object of EInvalidValue ## raised if an invalid - ## PEG has been detected - TPegParser = object of TPegLexer ## the PEG parser object - tok: TToken - nonterms: seq[PNonTerminal] - modifier: TModifier + EInvalidPeg* = object of ValueError ## raised if an invalid + ## PEG has been detected + PegParser = object of PegLexer ## the PEG parser object + tok: Token + nonterms: seq[NonTerminal] + modifier: Modifier captures: int identIsVerbatim: bool - skip: TPeg + skip: Peg -proc pegError(p: TPegParser, msg: string, line = -1, col = -1) = - var e: ref EInvalidPeg - new(e) - e.msg = errorStr(p, msg, line, col) +func pegError(p: PegParser, msg: string, line = -1, col = -1) = + var e = (ref EInvalidPeg)(msg: errorStr(p, msg, line, col)) raise e -proc getTok(p: var TPegParser) = +func getTok(p: var PegParser) = getTok(p, p.tok) - if p.tok.kind == tkInvalid: pegError(p, "invalid token") + if p.tok.kind == tkInvalid: pegError(p, "'" & p.tok.literal & "' is invalid token") -proc eat(p: var TPegParser, kind: TTokKind) = +func eat(p: var PegParser, kind: TokKind) = if p.tok.kind == kind: getTok(p) else: pegError(p, tokKindToStr[kind] & " expected") -proc parseExpr(p: var TPegParser): TPeg +func parseExpr(p: var PegParser): Peg {.gcsafe.} -proc getNonTerminal(p: var TPegParser, name: string): PNonTerminal = +func getNonTerminal(p: var PegParser, name: string): NonTerminal = for i in 0..high(p.nonterms): result = p.nonterms[i] if cmpIgnoreStyle(result.name, name) == 0: return @@ -1443,43 +1848,46 @@ proc getNonTerminal(p: var TPegParser, name: string): PNonTerminal = result = newNonTerminal(name, getLine(p), getColumn(p)) add(p.nonterms, result) -proc modifiedTerm(s: string, m: TModifier): TPeg = +func modifiedTerm(s: string, m: Modifier): Peg = case m of modNone, modVerbatim: result = term(s) of modIgnoreCase: result = termIgnoreCase(s) of modIgnoreStyle: result = termIgnoreStyle(s) -proc modifiedBackref(s: int, m: TModifier): TPeg = +func modifiedBackref(s: int, m: Modifier): Peg = + var + reverse = s < 0 + index = if reverse: -s else: s case m - of modNone, modVerbatim: result = backRef(s) - of modIgnoreCase: result = backRefIgnoreCase(s) - of modIgnoreStyle: result = backRefIgnoreStyle(s) + of modNone, modVerbatim: result = backref(index, reverse) + of modIgnoreCase: result = backrefIgnoreCase(index, reverse) + of modIgnoreStyle: result = backrefIgnoreStyle(index, reverse) -proc builtin(p: var TPegParser): TPeg = +func builtin(p: var PegParser): Peg = # do not use "y", "skip" or "i" as these would be ambiguous case p.tok.literal of "n": result = newLine() - of "d": result = charset({'0'..'9'}) - of "D": result = charset({'\1'..'\xff'} - {'0'..'9'}) - of "s": result = charset({' ', '\9'..'\13'}) - of "S": result = charset({'\1'..'\xff'} - {' ', '\9'..'\13'}) - of "w": result = charset({'a'..'z', 'A'..'Z', '_', '0'..'9'}) - of "W": result = charset({'\1'..'\xff'} - {'a'..'z','A'..'Z','_','0'..'9'}) - of "a": result = charset({'a'..'z', 'A'..'Z'}) - of "A": result = charset({'\1'..'\xff'} - {'a'..'z', 'A'..'Z'}) + of "d": result = charSet({'0'..'9'}) + of "D": result = charSet({'\1'..'\xff'} - {'0'..'9'}) + of "s": result = charSet({' ', '\9'..'\13'}) + of "S": result = charSet({'\1'..'\xff'} - {' ', '\9'..'\13'}) + of "w": result = charSet({'a'..'z', 'A'..'Z', '_', '0'..'9'}) + of "W": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z', '_', '0'..'9'}) + of "a": result = charSet({'a'..'z', 'A'..'Z'}) + of "A": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z'}) of "ident": result = pegs.ident - of "letter": result = UnicodeLetter() - of "upper": result = UnicodeUpper() - of "lower": result = UnicodeLower() - of "title": result = UnicodeTitle() - of "white": result = UnicodeWhitespace() + of "letter": result = unicodeLetter() + of "upper": result = unicodeUpper() + of "lower": result = unicodeLower() + of "title": result = unicodeTitle() + of "white": result = unicodeWhitespace() else: pegError(p, "unknown built-in: " & p.tok.literal) -proc token(terminal: TPeg, p: TPegParser): TPeg = +func token(terminal: Peg, p: PegParser): Peg = if p.skip.kind == pkEmpty: result = terminal else: result = sequence(p.skip, terminal) -proc primary(p: var TPegParser): TPeg = +func primary(p: var PegParser): Peg = case p.tok.kind of tkAmp: getTok(p) @@ -1493,18 +1901,19 @@ proc primary(p: var TPegParser): TPeg = of tkCurlyAt: getTok(p) return !*\primary(p).token(p) - else: nil + else: discard case p.tok.kind of tkIdentifier: - if p.identIsVerbatim: + if p.identIsVerbatim: var m = p.tok.modifier if m == modNone: m = p.modifier result = modifiedTerm(p.tok.literal, m).token(p) getTok(p) elif not arrowIsNextTok(p): var nt = getNonTerminal(p, p.tok.literal) - incl(nt.flags, ntUsed) - result = nonTerminal(nt).token(p) + {.cast(noSideEffect).}: + incl(nt.flags, ntUsed) + result = nonterminal(nt).token(p) getTok(p) else: pegError(p, "expression expected, but found: " & p.tok.literal) @@ -1516,7 +1925,7 @@ proc primary(p: var TPegParser): TPeg = of tkCharSet: if '\0' in p.tok.charset: pegError(p, "binary zero ('\\0') not allowed in character class") - result = charset(p.tok.charset).token(p) + result = charSet(p.tok.charset).token(p) getTok(p) of tkParLe: getTok(p) @@ -1527,6 +1936,9 @@ proc primary(p: var TPegParser): TPeg = result = capture(parseExpr(p)).token(p) eat(p, tkCurlyRi) inc(p.captures) + of tkEmptyCurl: + result = capture() + getTok(p) of tkAny: result = any().token(p) getTok(p) @@ -1539,18 +1951,18 @@ proc primary(p: var TPegParser): TPeg = of tkEscaped: result = term(p.tok.literal[0]).token(p) getTok(p) - of tkDollar: + of tkDollar: result = endAnchor() getTok(p) - of tkHat: + of tkHat: result = startAnchor() getTok(p) of tkBackref: + if abs(p.tok.index) > p.captures or p.tok.index == 0: + pegError(p, "invalid back reference index: " & $p.tok.index) var m = p.tok.modifier if m == modNone: m = p.modifier - result = modifiedBackRef(p.tok.index, m).token(p) - if p.tok.index < 0 or p.tok.index > p.captures: - pegError(p, "invalid back reference index: " & $p.tok.index) + result = modifiedBackref(p.tok.index, m).token(p) getTok(p) else: pegError(p, "expression expected, but found: " & p.tok.literal) @@ -1568,13 +1980,13 @@ proc primary(p: var TPegParser): TPeg = getTok(p) else: break -proc seqExpr(p: var TPegParser): TPeg = +func seqExpr(p: var PegParser): Peg = result = primary(p) while true: case p.tok.kind - of tkAmp, tkNot, tkAt, tkStringLit, tkCharset, tkParLe, tkCurlyLe, - tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref, - tkHat, tkCurlyAt: + of tkAmp, tkNot, tkAt, tkStringLit, tkCharSet, tkParLe, tkCurlyLe, + tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref, + tkHat, tkCurlyAt, tkEmptyCurl: result = sequence(result, primary(p)) of tkIdentifier: if not arrowIsNextTok(p): @@ -1582,27 +1994,29 @@ proc seqExpr(p: var TPegParser): TPeg = else: break else: break -proc parseExpr(p: var TPegParser): TPeg = +func parseExpr(p: var PegParser): Peg = result = seqExpr(p) while p.tok.kind == tkBar: getTok(p) result = result / seqExpr(p) - -proc parseRule(p: var TPegParser): PNonTerminal = + +func parseRule(p: var PegParser): NonTerminal = if p.tok.kind == tkIdentifier and arrowIsNextTok(p): result = getNonTerminal(p, p.tok.literal) if ntDeclared in result.flags: pegError(p, "attempt to redefine: " & result.name) - result.line = getLine(p) - result.col = getColumn(p) + {.cast(noSideEffect).}: + result.line = getLine(p) + result.col = getColumn(p) getTok(p) eat(p, tkArrow) - result.rule = parseExpr(p) - incl(result.flags, ntDeclared) # NOW inlining may be attempted + {.cast(noSideEffect).}: + result.rule = parseExpr(p) + incl(result.flags, ntDeclared) # NOW inlining may be attempted else: pegError(p, "rule expected, but found: " & p.tok.literal) - -proc rawParse(p: var TPegParser): TPeg = + +func rawParse(p: var PegParser): Peg = ## parses a rule or a PEG expression while p.tok.kind == tkBuiltin: case p.tok.literal @@ -1632,12 +2046,12 @@ proc rawParse(p: var TPegParser): TPeg = elif ntUsed notin nt.flags and i > 0: pegError(p, "unused rule: " & nt.name, nt.line, nt.col) -proc parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): TPeg = - ## constructs a TPeg object from `pattern`. `filename`, `line`, `col` are +func parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): Peg = + ## constructs a Peg object from `pattern`. `filename`, `line`, `col` are ## used for error messages, but they only provide start offsets. `parsePeg` ## keeps track of line and column numbers within `pattern`. - var p: TPegParser - init(TPegLexer(p), pattern, filename, line, col) + var p: PegParser + init(PegLexer(p), pattern, filename, line, col) p.tok.kind = tkInvalid p.tok.modifier = modNone p.tok.literal = "" @@ -1647,14 +2061,14 @@ proc parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): TPeg = getTok(p) result = rawParse(p) -proc peg*(pattern: string): TPeg = - ## constructs a TPeg object from the `pattern`. The short name has been - ## chosen to encourage its use as a raw string modifier:: +func peg*(pattern: string): Peg = + ## constructs a Peg object from the `pattern`. The short name has been + ## chosen to encourage its use as a raw string modifier: ## - ## peg"{\ident} \s* '=' \s* {.*}" + ## peg"{\ident} \s* '=' \s* {.*}" result = parsePeg(pattern, "pattern") -proc escapePeg*(s: string): string = +func escapePeg*(s: string): string = ## escapes `s` so that it is matched verbatim when used as a peg. result = "" var inQuote = false @@ -1672,102 +2086,3 @@ proc escapePeg*(s: string): string = inQuote = true result.add(c) if inQuote: result.add('\'') - -when isMainModule: - assert escapePeg("abc''def'") == r"'abc'\x27\x27'def'\x27" - assert match("(a b c)", peg"'(' @ ')'") - assert match("W_HI_Le", peg"\y 'while'") - assert(not match("W_HI_L", peg"\y 'while'")) - assert(not match("W_HI_Le", peg"\y v'while'")) - assert match("W_HI_Le", peg"y'while'") - - assert($ +digits == $peg"\d+") - assert "0158787".match(peg"\d+") - assert "ABC 0232".match(peg"\w+\s+\d+") - assert "ABC".match(peg"\d+ / \w+") - - for word in split("00232this02939is39an22example111", peg"\d+"): - writeln(stdout, word) - - assert matchLen("key", ident) == 3 - - var pattern = sequence(ident, *whitespace, term('='), *whitespace, ident) - assert matchLen("key1= cal9", pattern) == 11 - - var ws = newNonTerminal("ws", 1, 1) - ws.rule = *whitespace - - var expr = newNonTerminal("expr", 1, 1) - expr.rule = sequence(capture(ident), *sequence( - nonterminal(ws), term('+'), nonterminal(ws), nonterminal(expr))) - - var c: TCaptures - var s = "a+b + c +d+e+f" - assert rawMatch(s, expr.rule, 0, c) == len(s) - var a = "" - for i in 0..c.ml-1: - a.add(substr(s, c.matches[i][0], c.matches[i][1])) - assert a == "abcdef" - #echo expr.rule - - #const filename = "lib/devel/peg/grammar.txt" - #var grammar = parsePeg(newFileStream(filename, fmRead), filename) - #echo "a <- [abc]*?".match(grammar) - assert find("_____abc_______", term("abc"), 2) == 5 - assert match("_______ana", peg"A <- 'ana' / . A") - assert match("abcs%%%", peg"A <- ..A / .A / '%'") - - if "abc" =~ peg"{'a'}'bc' 'xyz' / {\ident}": - assert matches[0] == "abc" - else: - assert false - - var g2 = peg"""S <- A B / C D - A <- 'a'+ - B <- 'b'+ - C <- 'c'+ - D <- 'd'+ - """ - assert($g2 == "((A B) / (C D))") - assert match("cccccdddddd", g2) - assert("var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2") == - "var1<-keykey; var2<-key2key2") - assert("var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2") == - "$1<-$2$2; $1<-$2$2") - assert "var1=key; var2=key2".endsWith(peg"{\ident}'='{\ident}") - - if "aaaaaa" =~ peg"'aa' !. / ({'a'})+": - assert matches[0] == "a" - else: - assert false - - if match("abcdefg", peg"c {d} ef {g}", matches, 2): - assert matches[0] == "d" - assert matches[1] == "g" - else: - assert false - - for x in findAll("abcdef", peg".", 3): - echo x - - for x in findAll("abcdef", peg"^{.}", 3): - assert x == "d" - - if "f(a, b)" =~ peg"{[0-9]+} / ({\ident} '(' {@} ')')": - assert matches[0] == "f" - assert matches[1] == "a, b" - else: - assert false - - assert match("eine übersicht und außerdem", peg"(\letter \white*)+") - # ß is not a lower cased letter?! - assert match("eine übersicht und auerdem", peg"(\lower \white*)+") - assert match("EINE ÜBERSICHT UND AUSSERDEM", peg"(\upper \white*)+") - assert(not match("456678", peg"(\letter)+")) - - assert("var1 = key; var2 = key2".replacef( - peg"\skip(\s*) {\ident}'='{\ident}", "$1<-$2$2") == - "var1<-keykey;var2<-key2key2") - - assert match("prefix/start", peg"^start$", 7) - |