#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2009 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## Simple PEG (Parsing expression grammar) matching. Uses no memoization, but
## uses superoperators and symbol inlining to improve performance. Note:
## Matching performance is hopefully competitive with optimized regular
## expression engines.
##
## .. include:: ../doc/pegdocs.txt
##

const
  useUnicode = true ## change this to deactivate proper UTF-8 support

import strutils

when useUnicode:
  import unicode

const
  InlineThreshold = 5  ## number of leaves; -1 to disable inlining

type
  TPegKind = enum
    pkEmpty,
    pkAny,              ## any character (.)
    pkAnyRune,          ## any Unicode character (_)
    pkNewLine,          ## CR-LF, LF, CR
    pkTerminal,
    pkTerminalIgnoreCase,
    pkTerminalIgnoreStyle,
    pkChar,             ## single character to match
    pkCharChoice,
    pkNonTerminal,
    pkSequence,         ## a b c ... --> Internal DSL: peg(a, b, c)
    pkOrderedChoice,    ## a / b / ... --> Internal DSL: a / b or /[a, b, c]
    pkGreedyRep,        ## a*     --> Internal DSL: *a
                        ## a+     --> (a a*)
    pkGreedyRepChar,    ## x* where x is a single character (superop)
    pkGreedyRepSet,     ## [set]* (superop)
    pkGreedyAny,        ## .* or _* (superop)
    pkOption,           ## a?     --> Internal DSL: ?a
    pkAndPredicate,     ## &a     --> Internal DSL: &a
    pkNotPredicate,     ## !a     --> Internal DSL: !a
    pkCapture,          ## {a}    --> Internal DSL: capture(a)
    pkSearch,           ## @a     --> Internal DSL: @a
    pkRule,             ## a <- b
    pkList              ## a, b
  TNonTerminalFlag = enum
    ntDeclared, ntUsed
  TNonTerminal {.final.} = object  ## represents a non terminal symbol
    name: string                   ## the name of the symbol
    line: int                      ## the line the symbol has been declared/used in
    col: int                       ## the column the symbol has been declared/used in
    flags: set[TNonTerminalFlag]   ## the nonterminal's flags
    rule: TNode                    ## the rule that the symbol refers to
  TNode {.final.} = object
    case kind: TPegKind
    of pkEmpty, pkAny, pkAnyRune, pkGreedyAny, pkNewLine: nil
    of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle: term: string
    of pkChar, pkGreedyRepChar: ch: char
    of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char]
    of pkNonTerminal: nt: PNonTerminal
    else: sons: seq[TNode]
  PNonTerminal* = ref TNonTerminal

  TPeg* = TNode ## type that represents a PEG

proc term*(t: string): TPeg =
  ## constructs a PEG from a terminal string
  if t.len != 1:
    result.kind = pkTerminal
    result.term = t
  else:
    result.kind = pkChar
    result.ch = t[0]

proc termIgnoreCase*(t: string): TPeg =
  ## constructs a PEG from a terminal string; ignore case for matching
  result.kind = pkTerminalIgnoreCase
  result.term = t

proc termIgnoreStyle*(t: string): TPeg =
  ## constructs a PEG from a terminal string; ignore style for matching
  result.kind = pkTerminalIgnoreStyle
  result.term = t

proc term*(t: char): TPeg =
  ## constructs a PEG from a terminal char
  assert t != '\0'
  result.kind = pkChar
  result.ch = t

proc charSet*(s: set[char]): TPeg =
  ## constructs a PEG from a character set `s`
  assert '\0' notin s
  result.kind = pkCharChoice
  new(result.charChoice)
  result.charChoice^ = s

proc len(a: TPeg): int {.inline.} = return a.sons.len
proc add(d: var TPeg, s: TPeg) {.inline.} = add(d.sons, s)

proc addChoice(dest: var TPeg, elem: TPeg) =
  var L = dest.len-1
  if L >= 0 and dest.sons[L].kind == pkCharChoice:
    case elem.kind
    of pkCharChoice:
      dest.sons[L].charChoice^ = dest.sons[L].charChoice^ + elem.charChoice^
    of pkChar:
      incl(dest.sons[L].charChoice^, elem.ch)
    else: add(dest, elem)
  else: add(dest, elem)
template multipleOp(k: TPegKind, localOpt: expr) =
  result.kind = k
  result.sons = @[]
  for x in items(a):
    if x.kind == k:
      for y in items(x.sons): localOpt(result, y)
    else:
      localOpt(result, x)
  if result.len == 1:
    result = result.sons[0]

proc `/`*(a: openArray[TPeg]): TPeg =
  ## constructs an ordered choice with the PEGs in `a`
  multipleOp(pkOrderedChoice, addChoice)

proc addSequence(dest: var TPeg, elem: TPeg) =
  var L = dest.len-1
  if L >= 0 and dest.sons[L].kind == pkTerminal:
    case elem.kind
    of pkTerminal:
      add(dest.sons[L].term, elem.term)
    of pkChar:
      add(dest.sons[L].term, elem.ch)
    else: add(dest, elem)
  else: add(dest, elem)

proc sequence*(a: openArray[TPeg]): TPeg =
  ## constructs a sequence with all the PEGs from `a`
  multipleOp(pkSequence, addSequence)

proc `?`*(a: TPeg): TPeg =
  ## constructs an optional for the PEG `a`
  if a.kind in {pkOption, pkGreedyRep, pkGreedyAny, pkGreedyRepChar,
                pkGreedyRepSet}:
    # a* ?  --> a*
    # a? ?  --> a?
    result = a
  else:
    result.kind = pkOption
    result.sons = @[a]

proc `*`*(a: TPeg): TPeg =
  ## constructs a "greedy repetition" for the PEG `a`
  case a.kind
  of pkGreedyRep, pkGreedyRepChar, pkGreedyRepSet, pkGreedyAny, pkOption:
    assert false # produces endless loop!
  of pkChar:
    result.kind = pkGreedyRepChar
    result.ch = a.ch
  of pkCharChoice:
    result.kind = pkGreedyRepSet
    result.charChoice = a.charChoice # copying a reference suffices!
  of pkAny, pkAnyRune:
    result.kind = pkGreedyAny
  else:
    result.kind = pkGreedyRep
    result.sons = @[a]

proc `@`*(a: TPeg): TPeg =
  ## constructs a "search" for the PEG `a`
  result.kind = pkSearch
  result.sons = @[a]

when false:
  proc contains(a: TPeg, k: TPegKind): bool =
    if a.kind == k: return true
    case a.kind
    of pkEmpty, pkAny, pkAnyRune, pkGreedyAny, pkNewLine, pkTerminal,
       pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar, pkGreedyRepChar,
       pkCharChoice, pkGreedyRepSet: nil
    of pkNonTerminal: return true
    else:
      for i in 0..a.sons.len-1:
        if contains(a.sons[i], k): return true

proc `+`*(a: TPeg): TPeg =
  ## constructs a "greedy positive repetition" with the PEG `a`
  return sequence(a, *a)

proc `&`*(a: TPeg): TPeg =
  ## constructs an "and predicate" with the PEG `a`
  result.kind = pkAndPredicate
  result.sons = @[a]

proc `!`*(a: TPeg): TPeg =
  ## constructs a "not predicate" with the PEG `a`
  result.kind = pkNotPredicate
  result.sons = @[a]

proc any*: TPeg {.inline.} =
  ## constructs the PEG `any character`:idx: (``.``)
  result.kind = pkAny

proc anyRune*: TPeg {.inline.} =
  ## constructs the PEG `any rune`:idx: (``_``)
  result.kind = pkAnyRune

proc newLine*: TPeg {.inline.} =
  ## constructs the PEG `newline`:idx: (``\n``)
  result.kind = pkNewline

proc capture*(a: TPeg): TPeg =
  ## constructs a capture with the PEG `a`
  result.kind = pkCapture
  result.sons = @[a]

proc spaceCost(n: TPeg): int =
  case n.kind
  of pkEmpty: nil
  of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar,
     pkGreedyRepChar, pkCharChoice, pkGreedyRepSet, pkAny, pkAnyRune,
     pkNewLine, pkGreedyAny:
    result = 1
  of pkNonTerminal:
    # we cannot inline a rule with a non-terminal
    result = InlineThreshold+1
  else:
    for i in 0..n.len-1:
      inc(result, spaceCost(n.sons[i]))
      if result >= InlineThreshold: break

proc nonterminal*(n: PNonTerminal): TPeg =
  ## constructs a PEG that consists of the nonterminal symbol
  assert n != nil
  if ntDeclared in n.flags and spaceCost(n.rule) < InlineThreshold:
    when false: echo "inlining symbol: ", n.name
    result = n.rule # inlining of rule enables better optimizations
  else:
    result.kind = pkNonTerminal
    result.nt = n

proc newNonTerminal*(name: string, line, column: int): PNonTerminal =
  ## constructs a nonterminal symbol
  new(result)
  result.name = name
  result.line = line
  result.col = column
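# A small sketch, assuming only the constructors defined above: repeating a
# single character, a character set or ``.`` folds into a specialized
# superoperator node instead of a generic ``pkGreedyRep`` wrapper.
when isMainModule:
  assert((*term('a')).kind == pkGreedyRepChar)
  assert((*charSet({'0'..'9'})).kind == pkGreedyRepSet)
  assert((*any()).kind == pkGreedyAny)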
template letters*: expr =
  ## expands to ``charset({'A'..'Z', 'a'..'z'})``
  charset({'A'..'Z', 'a'..'z'})

template digits*: expr =
  ## expands to ``charset({'0'..'9'})``
  charset({'0'..'9'})

template whitespace*: expr =
  ## expands to ``charset({' ', '\9'..'\13'})``
  charset({' ', '\9'..'\13'})

template identChars*: expr =
  ## expands to ``charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})``
  charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})

template identStartChars*: expr =
  ## expands to ``charset({'A'..'Z', 'a'..'z', '_'})``
  charset({'a'..'z', 'A'..'Z', '_'})

template ident*: expr =
  ## same as ``[a-zA-Z_][a-zA-Z_0-9]*``; standard identifier
  sequence(charset({'a'..'z', 'A'..'Z', '_'}),
           *charset({'a'..'z', 'A'..'Z', '0'..'9', '_'}))

template natural*: expr =
  ## same as ``\d+``
  +digits

const
  MaxSubpatterns* = 10 ## defines the maximum number of subpatterns that
                       ## can be captured. More subpatterns cannot be captured!

# ------------------------- debugging -----------------------------------------

proc esc(c: char, reserved = {'\0'..'\255'}): string =
  case c
  of '\b': result = "\\b"
  of '\t': result = "\\t"
  of '\c': result = "\\c"
  of '\L': result = "\\l"
  of '\v': result = "\\v"
  of '\f': result = "\\f"
  of '\e': result = "\\e"
  of '\a': result = "\\a"
  of '\\': result = "\\\\"
  of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c
  elif c < ' ' or c >= '\128': result = '\\' & $ord(c)
  elif c in reserved: result = '\\' & c
  else: result = $c

proc singleQuoteEsc(c: Char): string = return "'" & esc(c, {'\''}) & "'"

proc singleQuoteEsc(str: string): string =
  result = "'"
  for c in items(str): add result, esc(c, {'\''})
  add result, '\''

proc charSetEscAux(cc: set[char]): string =
  const reserved = {'^', '-', ']'}
  result = ""
  var c1 = 0
  while c1 <= 0xff:
    if chr(c1) in cc:
      var c2 = c1
      while c2 < 0xff and chr(succ(c2)) in cc: inc(c2)
      if c1 == c2:
        add result, esc(chr(c1), reserved)
      elif c2 == succ(c1):
        add result, esc(chr(c1), reserved) & esc(chr(c2), reserved)
      else:
        add result, esc(chr(c1), reserved) & '-' & esc(chr(c2), reserved)
      c1 = c2
    inc(c1)

proc CharSetEsc(cc: set[char]): string =
  if card(cc) >= 128+64:
    result = "[^" & CharSetEscAux({'\1'..'\xFF'} - cc) & ']'
  else:
    result = '[' & CharSetEscAux(cc) & ']'

proc toStrAux(r: TPeg, res: var string) =
  case r.kind
  of pkEmpty: add(res, "()")
  of pkAny: add(res, '.')
  of pkAnyRune: add(res, '_')
  of pkNewline: add(res, "\\n")
  of pkTerminal: add(res, singleQuoteEsc(r.term))
  of pkTerminalIgnoreCase:
    add(res, 'i')
    add(res, singleQuoteEsc(r.term))
  of pkTerminalIgnoreStyle:
    add(res, 'y')
    add(res, singleQuoteEsc(r.term))
  of pkChar: add(res, singleQuoteEsc(r.ch))
  of pkCharChoice: add(res, charSetEsc(r.charChoice^))
  of pkNonTerminal: add(res, r.nt.name)
  of pkSequence:
    add(res, '(')
    toStrAux(r.sons[0], res)
    for i in 1 .. high(r.sons):
      add(res, ' ')
      toStrAux(r.sons[i], res)
    add(res, ')')
  of pkOrderedChoice:
    add(res, '(')
    toStrAux(r.sons[0], res)
    for i in 1 .. high(r.sons):
      add(res, " / ")
      toStrAux(r.sons[i], res)
    add(res, ')')
  of pkGreedyRep:
    toStrAux(r.sons[0], res)
    add(res, '*')
  of pkGreedyRepChar:
    add(res, singleQuoteEsc(r.ch))
    add(res, '*')
  of pkGreedyRepSet:
    add(res, charSetEsc(r.charChoice^))
    add(res, '*')
  of pkGreedyAny:
    add(res, ".*")
  of pkOption:
    toStrAux(r.sons[0], res)
    add(res, '?')
  of pkAndPredicate:
    add(res, '&')
    toStrAux(r.sons[0], res)
  of pkNotPredicate:
    add(res, '!')
    toStrAux(r.sons[0], res)
  of pkSearch:
    add(res, '@')
    toStrAux(r.sons[0], res)
  of pkCapture:
    add(res, '{')
    toStrAux(r.sons[0], res)
    add(res, '}')
  of pkRule:
    toStrAux(r.sons[0], res)
    add(res, " <- ")
    toStrAux(r.sons[1], res)
  of pkList:
    for i in 0 .. high(r.sons):
      toStrAux(r.sons[i], res)
      add(res, "\n")

proc `$` *(r: TPeg): string =
  ## converts a PEG to its string representation
  result = ""
  toStrAux(r, result)
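# A small sketch, assuming only the constructors and the `$` operator defined
# above: the string representation of a PEG built with the internal DSL
# mirrors the textual PEG notation.
when isMainModule:
  assert($sequence(term("ab"), *term('c')) == "('ab' 'c'*)")
  assert($(term("foo") / term("bar")) == "('foo' / 'bar')")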
# --------------------- core engine -------------------------------------------

type
  TMatchClosure {.final.} = object
    matches: array[0..maxSubpatterns-1, tuple[first, last: int]]
    ml: int

when not useUnicode:
  type
    TRune = char
  template fastRuneAt(s, i, ch: expr) =
    ch = s[i]
    inc(i)
  template runeLenAt(s, i: expr): expr = 1

proc m(s: string, p: TPeg, start: int, c: var TMatchClosure): int =
  ## this implements a simple PEG interpreter. Thanks to superoperators it
  ## has competitive performance nevertheless.
  ## Returns -1 if it does not match, else the length of the match
  case p.kind
  of pkEmpty: result = 0 # match of length 0
  of pkAny:
    if s[start] != '\0': result = 1
    else: result = -1
  of pkAnyRune:
    if s[start] != '\0': result = runeLenAt(s, start)
    else: result = -1
  of pkGreedyAny:
    result = len(s) - start
  of pkNewLine:
    if s[start] == '\L': result = 1
    elif s[start] == '\C':
      if s[start+1] == '\L': result = 2
      else: result = 1
    else: result = -1
  of pkTerminal:
    result = len(p.term)
    for i in 0..result-1:
      if p.term[i] != s[start+i]:
        result = -1
        break
  of pkTerminalIgnoreCase:
    var
      i = 0
      a, b: TRune
    result = start
    while i < len(p.term):
      fastRuneAt(p.term, i, a)
      fastRuneAt(s, result, b)
      if toLower(a) != toLower(b):
        result = -1
        break
    dec(result, start)
  of pkTerminalIgnoreStyle:
    var
      i = 0
      a, b: TRune
    result = start
    while i < len(p.term):
      while true:
        fastRuneAt(p.term, i, a)
        if a != TRune('_'): break
      while true:
        fastRuneAt(s, result, b)
        if b != TRune('_'): break
      if toLower(a) != toLower(b):
        result = -1
        break
    dec(result, start)
  of pkChar:
    if p.ch == s[start]: result = 1
    else: result = -1
  of pkCharChoice:
    if contains(p.charChoice^, s[start]): result = 1
    else: result = -1
  of pkNonTerminal:
    var oldMl = c.ml
    when false: echo "enter: ", p.nt.name
    result = m(s, p.nt.rule, start, c)
    when false: echo "leave: ", p.nt.name
    if result < 0: c.ml = oldMl
  of pkSequence:
    var oldMl = c.ml
    result = 0
    for i in 0..high(p.sons):
      var x = m(s, p.sons[i], start+result, c)
      if x < 0:
        c.ml = oldMl
        result = -1
        break
      else: inc(result, x)
  of pkOrderedChoice:
    var oldMl = c.ml
    for i in 0..high(p.sons):
      result = m(s, p.sons[i], start, c)
      if result >= 0: break
      c.ml = oldMl
  of pkSearch:
    var oldMl = c.ml
    result = 0
    while start+result < s.len:
      var x = m(s, p.sons[0], start+result, c)
      if x >= 0:
        inc(result, x)
        return
      inc(result)
    result = -1
    c.ml = oldMl
  of pkGreedyRep:
    result = 0
    while true:
      var x = m(s, p.sons[0], start+result, c)
      # if x == 0, we have an endless loop; so the correct behaviour would be
      # not to break. But endless loops can be easily introduced:
      # ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the
      # expected thing in this case.
      if x <= 0: break
      inc(result, x)
  of pkGreedyRepChar:
    result = 0
    var ch = p.ch
    while ch == s[start+result]: inc(result)
  of pkGreedyRepSet:
    result = 0
    while contains(p.charChoice^, s[start+result]): inc(result)
  of pkOption:
    result = max(0, m(s, p.sons[0], start, c))
  of pkAndPredicate:
    var oldMl = c.ml
    result = m(s, p.sons[0], start, c)
    if result >= 0: result = 0 # do not consume anything
    else: c.ml = oldMl
  of pkNotPredicate:
    var oldMl = c.ml
    result = m(s, p.sons[0], start, c)
    if result < 0: result = 0
    else:
      c.ml = oldMl
      result = -1
  of pkCapture:
    var idx = c.ml # reserve a slot for the subpattern
    inc(c.ml)
    result = m(s, p.sons[0], start, c)
    if result >= 0:
      if idx < maxSubpatterns:
        c.matches[idx] = (start, start+result-1)
      #else: silently ignore the capture
    else:
      c.ml = idx
  of pkRule, pkList: assert false

proc match*(s: string, pattern: TPeg, matches: var openarray[string],
            start = 0): bool =
  ## returns ``true`` if ``s[start..]`` matches the ``pattern`` and
  ## the captured substrings in the array ``matches``. If it does not
  ## match, nothing is written into ``matches`` and ``false`` is
  ## returned.
  var c: TMatchClosure
  result = m(s, pattern, start, c) == len(s)
  if result:
    for i in 0..c.ml-1:
      matches[i] = copy(s, c.matches[i][0], c.matches[i][1])

proc match*(s: string, pattern: TPeg, start = 0): bool =
  ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
  var c: TMatchClosure
  result = m(s, pattern, start, c) == len(s)

proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
               start = 0): int =
  ## the same as ``match``, but it returns the length of the match;
  ## if there is no match, -1 is returned. Note that a match length
  ## of zero can happen. It's possible that a suffix of `s` remains
  ## that does not belong to the match.
  var c: TMatchClosure
  result = m(s, pattern, start, c)
  if result >= 0:
    for i in 0..c.ml-1:
      matches[i] = copy(s, c.matches[i][0], c.matches[i][1])

proc matchLen*(s: string, pattern: TPeg, start = 0): int =
  ## the same as ``match``, but it returns the length of the match;
  ## if there is no match, -1 is returned. Note that a match length
  ## of zero can happen. It's possible that a suffix of `s` remains
  ## that does not belong to the match.
  var c: TMatchClosure
  result = m(s, pattern, start, c)

proc find*(s: string, pattern: TPeg, matches: var openarray[string],
           start = 0): int =
  ## returns the starting position of ``pattern`` in ``s`` and the captured
  ## substrings in the array ``matches``. If it does not match, nothing
  ## is written into ``matches`` and -1 is returned.
  for i in 0 .. s.len-1:
    if matchLen(s, pattern, matches, i) >= 0: return i
  return -1
  # could also use the pattern here: (!P .)* P

proc find*(s: string, pattern: TPeg, start = 0): int =
  ## returns the starting position of ``pattern`` in ``s``. If it does not
  ## match, -1 is returned.
  for i in 0 .. s.len-1:
    if matchLen(s, pattern, i) >= 0: return i
  return -1
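# A small sketch, assuming the procs above: `match` must consume the whole
# string, `matchLen` reports how many characters matched (possibly fewer than
# len(s)), and `find` reports where the pattern first matches.
when isMainModule:
  assert match("abab", *term("ab"))
  assert matchLen("abXY", *term("ab")) == 2
  assert find("xxabxx", term("ab")) == 2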
template `=~`*(s: string, pattern: TPeg): expr =
  ## This calls ``match`` with an implicitly declared ``matches`` array that
  ## can be used in the scope of the ``=~`` call:
  ##
  ## .. code-block:: nimrod
  ##
  ##   if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}":
  ##     # matches a key=value pair:
  ##     echo("Key: ", matches[0])
  ##     echo("Value: ", matches[1])
  ##   elif line =~ peg"\s*{'#'.*}":
  ##     # matches a comment
  ##     # note that the implicit ``matches`` array is different from the
  ##     # ``matches`` array of the first branch
  ##     echo("comment: ", matches[0])
  ##   else:
  ##     echo("syntax error")
  ##
  when not definedInScope(matches):
    var matches: array[0..maxSubpatterns-1, string]
  match(s, pattern, matches)

# ------------------------- more string handling ------------------------------

proc contains*(s: string, pattern: TPeg, start = 0): bool =
  ## same as ``find(s, pattern, start) >= 0``
  return find(s, pattern, start) >= 0

proc contains*(s: string, pattern: TPeg, matches: var openArray[string],
               start = 0): bool =
  ## same as ``find(s, pattern, matches, start) >= 0``
  return find(s, pattern, matches, start) >= 0

proc startsWith*(s: string, prefix: TPeg): bool =
  ## returns true if `s` starts with the pattern `prefix`
  result = matchLen(s, prefix) >= 0

proc endsWith*(s: string, suffix: TPeg): bool =
  ## returns true if `s` ends with the pattern `suffix`
  for i in 0 .. s.len-1:
    if matchLen(s, suffix, i) == s.len - i: return true

proc replace*(s: string, sub: TPeg, by: string): string =
  ## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by`
  ## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples:
  ##
  ## .. code-block:: nimrod
  ##   "var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2")
  ##
  ## Results in:
  ##
  ## .. code-block:: nimrod
  ##
  ##   "var1<-keykey; var2<-key2key2"
  result = ""
  var i = 0
  var caps: array[0..maxSubpatterns-1, string]
  while i < s.len:
    var x = matchLen(s, sub, caps, i)
    if x <= 0:
      add(result, s[i])
      inc(i)
    else:
      addf(result, by, caps)
      inc(i, x)
  # copy the rest:
  add(result, copy(s, i))

proc parallelReplace*(s: string, subs: openArray[
                      tuple[pattern: TPeg, repl: string]]): string =
  ## Returns a modified copy of `s` with the substitutions in `subs`
  ## applied in parallel.
  result = ""
  var i = 0
  var caps: array[0..maxSubpatterns-1, string]
  while i < s.len:
    block searchSubs:
      for j in 0..high(subs):
        var x = matchLen(s, subs[j][0], caps, i)
        if x > 0:
          addf(result, subs[j][1], caps)
          inc(i, x)
          break searchSubs
      add(result, s[i])
      inc(i)
  # copy the rest:
  add(result, copy(s, i))

proc transformFile*(infile, outfile: string,
                    subs: openArray[tuple[pattern: TPeg, repl: string]]) =
  ## reads in the file `infile`, performs a parallel replacement (calls
  ## `parallelReplace`) and writes back to `outfile`. Calls ``quit`` if an
  ## error occurs. This is supposed to be used for quick scripting.
  var x = readFile(infile)
  if not isNil(x):
    var f: TFile
    if open(f, outfile, fmWrite):
      write(f, x.parallelReplace(subs))
      close(f)
    else:
      quit("cannot open for writing: " & outfile)
  else:
    quit("cannot open for reading: " & infile)
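# A small sketch, assuming `parallelReplace` as defined above: every
# pattern/replacement pair is applied in a single left-to-right pass and, at
# each position, the first pattern that matches wins.
when isMainModule:
  assert(parallelReplace("x < y", [(term('<'), "&lt;"), (term('>'), "&gt;")]) ==
         "x &lt; y")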
iterator split*(s: string, sep: TPeg): string =
  ## Splits the string `s` into substrings.
  ##
  ## Substrings are separated by the PEG `sep`.
  ## Examples:
  ##
  ## .. code-block:: nimrod
  ##   for word in split("00232this02939is39an22example111", peg"\d+"):
  ##     writeln(stdout, word)
  ##
  ## Results in:
  ##
  ## .. code-block:: nimrod
  ##   "this"
  ##   "is"
  ##   "an"
  ##   "example"
  ##
  var
    first = 0
    last = 0
  while last < len(s):
    var x = matchLen(s, sep, last)
    if x > 0: inc(last, x)
    first = last
    while last < len(s):
      inc(last)
      x = matchLen(s, sep, last)
      if x > 0: break
    if first < last:
      yield copy(s, first, last-1)

proc split*(s: string, sep: TPeg): seq[string] {.noSideEffect.} =
  ## Splits the string `s` into substrings.
  accumulateResult(split(s, sep))

# ------------------- scanner -------------------------------------------------

type
  TModifier = enum
    modNone,
    modVerbatim,
    modIgnoreCase,
    modIgnoreStyle
  TTokKind = enum       ## enumeration of all tokens
    tkInvalid,          ## invalid token
    tkEof,              ## end of file reached
    tkAny,              ## .
    tkAnyRune,          ## _
    tkIdentifier,       ## abc
    tkStringLit,        ## "abc" or 'abc'
    tkCharSet,          ## [^A-Z]
    tkParLe,            ## '('
    tkParRi,            ## ')'
    tkCurlyLe,          ## '{'
    tkCurlyRi,          ## '}'
    tkArrow,            ## '<-'
    tkBar,              ## '/'
    tkStar,             ## '*'
    tkPlus,             ## '+'
    tkAmp,              ## '&'
    tkNot,              ## '!'
    tkOption,           ## '?'
    tkAt,               ## '@'
    tkBuiltin,          ## \identifier
    tkEscaped           ## \\

  TToken {.final.} = object  ## a token
    kind: TTokKind           ## the type of the token
    modifier: TModifier
    literal: string          ## the parsed (string) literal
    charset: set[char]       ## if kind == tkCharSet

  TPegLexer = object          ## the lexer object.
    bufpos: int               ## the current position within the buffer
    buf: cstring              ## the buffer itself
    LineNumber: int           ## the current line number
    lineStart: int            ## index of last line start in buffer
    colOffset: int            ## column to add
    filename: string

const
  tokKindToStr: array[TTokKind, string] = [
    "invalid", "[EOF]", ".", "_", "identifier", "string literal",
    "character set", "(", ")", "{", "}", "<-", "/", "*", "+", "&", "!", "?",
    "@", "built-in", "escaped"
  ]

proc HandleCR(L: var TPegLexer, pos: int): int =
  assert(L.buf[pos] == '\c')
  inc(L.linenumber)
  result = pos+1
  if L.buf[result] == '\L': inc(result)
  L.lineStart = result

proc HandleLF(L: var TPegLexer, pos: int): int =
  assert(L.buf[pos] == '\L')
  inc(L.linenumber)
  result = pos+1
  L.lineStart = result

proc init(L: var TPegLexer, input, filename: string, line = 1, col = 0) =
  L.buf = input
  L.bufpos = 0
  L.lineNumber = line
  L.colOffset = col
  L.lineStart = 0
  L.filename = filename

proc getColumn(L: TPegLexer): int {.inline.} =
  result = abs(L.bufpos - L.lineStart) + L.colOffset

proc getLine(L: TPegLexer): int {.inline.} =
  result = L.linenumber

proc errorStr(L: TPegLexer, msg: string, line = -1, col = -1): string =
  var line = if line < 0: getLine(L) else: line
  var col = if col < 0: getColumn(L) else: col
  result = "$1($2, $3) Error: $4" % [L.filename, $line, $col, msg]

proc handleHexChar(c: var TPegLexer, xi: var int) =
  case c.buf[c.bufpos]
  of '0'..'9':
    xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('0'))
    inc(c.bufpos)
  of 'a'..'f':
    xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('a') + 10)
    inc(c.bufpos)
  of 'A'..'F':
    xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('A') + 10)
    inc(c.bufpos)
  else: nil

proc getEscapedChar(c: var TPegLexer, tok: var TToken) =
  inc(c.bufpos)
  case c.buf[c.bufpos]
  of 'r', 'R', 'c', 'C':
    add(tok.literal, '\c')
    Inc(c.bufpos)
  of 'l', 'L':
    add(tok.literal, '\L')
    Inc(c.bufpos)
  of 'f', 'F':
    add(tok.literal, '\f')
    inc(c.bufpos)
  of 'e', 'E':
    add(tok.literal, '\e')
    Inc(c.bufpos)
  of 'a', 'A':
    add(tok.literal, '\a')
    Inc(c.bufpos)
  of 'b', 'B':
    add(tok.literal, '\b')
    Inc(c.bufpos)
  of 'v', 'V':
    add(tok.literal, '\v')
    Inc(c.bufpos)
  of 't', 'T':
    add(tok.literal, '\t')
    Inc(c.bufpos)
  of 'x', 'X':
    inc(c.bufpos)
    var xi = 0
    handleHexChar(c, xi)
    handleHexChar(c, xi)
    if xi == 0: tok.kind = tkInvalid
    else: add(tok.literal, Chr(xi))
  of '0'..'9':
    var val = ord(c.buf[c.bufpos]) - ord('0')
    Inc(c.bufpos)
    var i = 1
    while (i <= 3) and (c.buf[c.bufpos] in {'0'..'9'}):
      val = val * 10 + ord(c.buf[c.bufpos]) - ord('0')
      inc(c.bufpos)
      inc(i)
    if val > 0 and val <= 255: add(tok.literal, chr(val))
    else: tok.kind = tkInvalid
  of '\0'..'\31': tok.kind = tkInvalid
  elif c.buf[c.bufpos] in strutils.letters:
    tok.kind = tkInvalid
  else:
    add(tok.literal, c.buf[c.bufpos])
    Inc(c.bufpos)

proc skip(c: var TPegLexer) =
  var pos = c.bufpos
  var buf = c.buf
  while true:
    case buf[pos]
    of ' ', '\t':
      Inc(pos)
    of '#':
      while not (buf[pos] in {'\c', '\L', '\0'}): inc(pos)
    of '\c':
      pos = HandleCR(c, pos)
      buf = c.buf
    of '\L':
      pos = HandleLF(c, pos)
      buf = c.buf
    else:
      break # EndOfFile also leaves the loop
  c.bufpos = pos

proc getString(c: var TPegLexer, tok: var TToken) =
  tok.kind = tkStringLit
  var pos = c.bufPos + 1
  var buf = c.buf
  var quote = buf[pos-1]
  while true:
    case buf[pos]
    of '\\':
      c.bufpos = pos
      getEscapedChar(c, tok)
      pos = c.bufpos
    of '\c', '\L', '\0':
      tok.kind = tkInvalid
      break
    elif buf[pos] == quote:
      inc(pos)
      break
    else:
      add(tok.literal, buf[pos])
      Inc(pos)
  c.bufpos = pos

proc getCharSet(c: var TPegLexer, tok: var TToken) =
  tok.kind = tkCharSet
  tok.charset = {}
  var pos = c.bufPos + 1
  var buf = c.buf
  var caret = false
  if buf[pos] == '^':
    inc(pos)
    caret = true
  while true:
    var ch: char
    case buf[pos]
    of ']':
      inc(pos)
      break
    of '\\':
      c.bufpos = pos
      getEscapedChar(c, tok)
      pos = c.bufpos
      ch = tok.literal[tok.literal.len-1]
    of '\C', '\L', '\0':
      tok.kind = tkInvalid
      break
    else:
      ch = buf[pos]
      Inc(pos)
    incl(tok.charset, ch)
    if buf[pos] == '-':
      if buf[pos+1] == ']':
        incl(tok.charset, '-')
        inc(pos)
      else:
        inc(pos)
        var ch2: char
        case buf[pos]
        of '\\':
          c.bufpos = pos
          getEscapedChar(c, tok)
          pos = c.bufpos
          ch2 = tok.literal[tok.literal.len-1]
        of '\C', '\L', '\0':
          tok.kind = tkInvalid
          break
        else:
          ch2 = buf[pos]
          Inc(pos)
        for i in ord(ch)+1 .. ord(ch2): incl(tok.charset, chr(i))
  c.bufpos = pos
  if caret: tok.charset = {'\1'..'\xFF'} - tok.charset

proc getSymbol(c: var TPegLexer, tok: var TToken) =
  var pos = c.bufpos
  var buf = c.buf
  while true:
    add(tok.literal, buf[pos])
    Inc(pos)
    if buf[pos] notin strutils.IdentChars: break
  c.bufpos = pos
  tok.kind = tkIdentifier

proc getBuiltin(c: var TPegLexer, tok: var TToken) =
  if c.buf[c.bufpos+1] in strutils.Letters:
    inc(c.bufpos)
    getSymbol(c, tok)
    tok.kind = tkBuiltin
  else:
    tok.kind = tkEscaped
    getEscapedChar(c, tok) # may set tok.kind to tkInvalid

proc getTok(c: var TPegLexer, tok: var TToken) =
  tok.kind = tkInvalid
  tok.modifier = modNone
  setlen(tok.literal, 0)
  skip(c)
  case c.buf[c.bufpos]
  of '{':
    tok.kind = tkCurlyLe
    inc(c.bufpos)
    add(tok.literal, '{')
  of '}':
    tok.kind = tkCurlyRi
    inc(c.bufpos)
    add(tok.literal, '}')
  of '[':
    getCharset(c, tok)
  of '(':
    tok.kind = tkParLe
    Inc(c.bufpos)
    add(tok.literal, '(')
  of ')':
    tok.kind = tkParRi
    Inc(c.bufpos)
    add(tok.literal, ')')
  of '.':
    tok.kind = tkAny
    inc(c.bufpos)
    add(tok.literal, '.')
  of '_':
    tok.kind = tkAnyRune
    inc(c.bufpos)
    add(tok.literal, '_')
  of '\\':
    getBuiltin(c, tok)
  of '\'', '"': getString(c, tok)
  of '\0':
    tok.kind = tkEof
    tok.literal = "[EOF]"
  of 'a'..'z', 'A'..'Z', '\128'..'\255':
    getSymbol(c, tok)
    if c.buf[c.bufpos] in {'\'', '"'}:
      case tok.literal
      of "i": tok.modifier = modIgnoreCase
      of "y": tok.modifier = modIgnoreStyle
      of "v": tok.modifier = modVerbatim
      else: nil
      setLen(tok.literal, 0)
      getString(c, tok)
      if tok.modifier == modNone: tok.kind = tkInvalid
  of '+':
    tok.kind = tkPlus
    inc(c.bufpos)
    add(tok.literal, '+')
  of '*':
    tok.kind = tkStar
    inc(c.bufpos)
    add(tok.literal, '*')
  of '<':
    if c.buf[c.bufpos+1] == '-':
      inc(c.bufpos, 2)
      tok.kind = tkArrow
      add(tok.literal, "<-")
    else:
      add(tok.literal, '<')
  of '/':
    tok.kind = tkBar
    inc(c.bufpos)
    add(tok.literal, '/')
  of '?':
    tok.kind = tkOption
    inc(c.bufpos)
    add(tok.literal, '?')
  of '!':
    tok.kind = tkNot
    inc(c.bufpos)
    add(tok.literal, '!')
  of '&':
    tok.kind = tkAmp
    inc(c.bufpos)
    add(tok.literal, '&')
  of '@':
    tok.kind = tkAt
    inc(c.bufpos)
    add(tok.literal, '@')
  else:
    add(tok.literal, c.buf[c.bufpos])
    inc(c.bufpos)

proc arrowIsNextTok(c: TPegLexer): bool =
  # the only look ahead we need
  var pos = c.bufpos
  while c.buf[pos] in {'\t', ' '}: inc(pos)
  result = c.buf[pos] == '<' and c.buf[pos+1] == '-'

# ----------------------------- parser ----------------------------------------

type
  EInvalidPeg* = object of EBase ## raised if an invalid PEG has been detected
  TPegParser = object of TPegLexer ## the PEG parser object
    tok: TToken
    nonterms: seq[PNonTerminal]
    modifier: TModifier

proc getTok(p: var TPegParser) = getTok(p, p.tok)

proc pegError(p: TPegParser, msg: string, line = -1, col = -1) =
  var e: ref EInvalidPeg
  new(e)
  e.msg = errorStr(p, msg, line, col)
  raise e

proc eat(p: var TPegParser, kind: TTokKind) =
  if p.tok.kind == kind: getTok(p)
  else: pegError(p, tokKindToStr[kind] & " expected")

proc parseExpr(p: var TPegParser): TPeg

proc getNonTerminal(p: TPegParser, name: string): PNonTerminal =
  for i in 0..high(p.nonterms):
    result = p.nonterms[i]
    if cmpIgnoreStyle(result.name, name) == 0: return
  # forward reference:
  result = newNonTerminal(name, getLine(p), getColumn(p))
  add(p.nonterms, result)

proc modifiedTerm(s: string, m: TModifier): TPeg =
  case m
  of modNone, modVerbatim: result = term(s)
  of modIgnoreCase: result = termIgnoreCase(s)
  of modIgnoreStyle: result = termIgnoreStyle(s)

proc primary(p: var TPegParser): TPeg =
  case p.tok.kind
  of tkAmp:
    getTok(p)
    return &primary(p)
  of tkNot:
    getTok(p)
    return !primary(p)
  of tkAt:
    getTok(p)
    return @primary(p)
  else: nil
  case p.tok.kind
  of tkIdentifier:
    if not arrowIsNextTok(p):
      var nt = getNonTerminal(p, p.tok.literal)
      incl(nt.flags, ntUsed)
      result = nonTerminal(nt)
      getTok(p)
    else:
      pegError(p, "expression expected, but found: " & p.tok.literal)
  of tkStringLit:
    var m = p.tok.modifier
    if m == modNone: m = p.modifier
    result = modifiedTerm(p.tok.literal, m)
    getTok(p)
  of tkCharSet:
    if '\0' in p.tok.charset:
      pegError(p, "binary zero ('\\0') not allowed in character class")
    result = charset(p.tok.charset)
    getTok(p)
  of tkParLe:
    getTok(p)
    result = parseExpr(p)
    eat(p, tkParRi)
  of tkCurlyLe:
    getTok(p)
    result = capture(parseExpr(p))
    eat(p, tkCurlyRi)
  of tkAny:
    result = any()
    getTok(p)
  of tkAnyRune:
    result = anyRune()
    getTok(p)
  of tkBuiltin:
    case p.tok.literal
    of "n": result = newLine()
    of "d": result = charset({'0'..'9'})
    of "D": result = charset({'\1'..'\xff'} - {'0'..'9'})
    of "s": result = charset({' ', '\9'..'\13'})
    of "S": result = charset({'\1'..'\xff'} - {' ', '\9'..'\13'})
    of "w": result = charset({'a'..'z', 'A'..'Z', '_'})
    of "W": result = charset({'\1'..'\xff'} - {'a'..'z', 'A'..'Z', '_'})
    of "ident": result = pegs.ident
    else: pegError(p, "unknown built-in: " & p.tok.literal)
    getTok(p)
  of tkEscaped:
    result = term(p.tok.literal[0])
    getTok(p)
  else:
    pegError(p, "expression expected, but found: " & p.tok.literal)
    getTok(p) # we must consume a token here to prevent endless loops!
  while true:
    case p.tok.kind
    of tkOption:
      result = ?result
      getTok(p)
    of tkStar:
      result = *result
      getTok(p)
    of tkPlus:
      result = +result
      getTok(p)
    else: break

proc seqExpr(p: var TPegParser): TPeg =
  result = primary(p)
  while true:
    case p.tok.kind
    of tkAmp, tkNot, tkAt, tkStringLit, tkCharset, tkParLe, tkCurlyLe,
       tkAny, tkAnyRune, tkBuiltin, tkEscaped:
      result = sequence(result, primary(p))
    of tkIdentifier:
      if not arrowIsNextTok(p):
        result = sequence(result, primary(p))
      else: break
    else: break

proc parseExpr(p: var TPegParser): TPeg =
  result = seqExpr(p)
  while p.tok.kind == tkBar:
    getTok(p)
    result = result / seqExpr(p)

proc parseRule(p: var TPegParser): PNonTerminal =
  if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
    result = getNonTerminal(p, p.tok.literal)
    if ntDeclared in result.flags:
      pegError(p, "attempt to redefine: " & result.name)
    result.line = getLine(p)
    result.col = getColumn(p)
    getTok(p)
    eat(p, tkArrow)
    result.rule = parseExpr(p)
    incl(result.flags, ntDeclared) # NOW inlining may be attempted
  else:
    pegError(p, "rule expected, but found: " & p.tok.literal)

proc rawParse(p: var TPegParser): TPeg =
  ## parses a rule or a PEG expression
  if p.tok.kind == tkBuiltin:
    case p.tok.literal
    of "i":
      p.modifier = modIgnoreCase
      getTok(p)
    of "y":
      p.modifier = modIgnoreStyle
      getTok(p)
    else: nil
  if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
    result = parseRule(p).rule
    while p.tok.kind != tkEof:
      discard parseRule(p)
  else:
    result = parseExpr(p)
    if p.tok.kind != tkEof:
      pegError(p, "EOF expected, but found: " & p.tok.literal)
  for i in 0..high(p.nonterms):
    var nt = p.nonterms[i]
    if ntDeclared notin nt.flags:
      pegError(p, "undeclared identifier: " & nt.name, nt.line, nt.col)
    elif ntUsed notin nt.flags and i > 0:
      pegError(p, "unused rule: " & nt.name, nt.line, nt.col)

proc parsePeg*(input: string, filename = "pattern", line = 1, col = 0): TPeg =
  var p: TPegParser
  init(TPegLexer(p), input, filename, line, col)
  p.tok.kind = tkInvalid
  p.tok.modifier = modNone
  p.tok.literal = ""
  p.tok.charset = {}
  p.nonterms = @[]
  getTok(p)
  result = rawParse(p)
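# A small sketch, assuming `parsePeg` as defined above: it accepts either a
# single expression or a whole grammar; for a grammar the first rule is the
# start rule. `peg` below is merely a convenience wrapper around it. The
# pattern and name used here are illustrative only.
when isMainModule:
  var identPat = parsePeg("ident <- [a-zA-Z_][a-zA-Z_0-9]*", "example")
  assert match("foo_bar1", identPat)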
proc peg*(pattern: string): TPeg =
  ## constructs a TPeg object from the `pattern`. The short name has been
  ## chosen to encourage its use as a raw string modifier::
  ##
  ##   peg"{\ident} \s* '=' \s* {.*}"
  result = parsePeg(pattern, "pattern")

when isMainModule:
  assert match("(a b c)", peg"'(' @ ')'")
  assert match("W_HI_Le", peg"\y 'while'")
  assert(not match("W_HI_L", peg"\y 'while'"))
  assert(not match("W_HI_Le", peg"\y v'while'"))
  assert match("W_HI_Le", peg"y'while'")
  assert($ +digits == $peg"\d+")
  assert "0158787".match(peg"\d+")
  assert "ABC 0232".match(peg"\w+\s+\d+")
  assert "ABC".match(peg"\d+ / \w+")
  for word in split("00232this02939is39an22example111", peg"\d+"):
    writeln(stdout, word)

  assert matchLen("key", ident) == 3

  var pattern = sequence(ident, *whitespace, term('='), *whitespace, ident)
  assert matchLen("key1=  cal9", pattern) == 11

  var ws = newNonTerminal("ws", 1, 1)
  ws.rule = *whitespace

  var expr = newNonTerminal("expr", 1, 1)
  expr.rule = sequence(capture(ident), *sequence(
                nonterminal(ws), term('+'), nonterminal(ws), nonterminal(expr)))

  var c: TMatchClosure
  var s = "a+b + c +d+e+f"
  assert m(s, expr.rule, 0, c) == len(s)
  var a = ""
  for i in 0..c.ml-1:
    a.add(copy(s, c.matches[i][0], c.matches[i][1]))
  assert a == "abcdef"
  #echo expr.rule

  #const filename = "lib/devel/peg/grammar.txt"
  #var grammar = parsePeg(newFileStream(filename, fmRead), filename)
  #echo "a <- [abc]*?".match(grammar)

  assert find("_____abc_______", term("abc")) == 5
  assert match("_______ana", peg"A <- 'ana' / . A")
  assert match("abcs%%%", peg"A <- ..A / .A / '%'")

  if "abc" =~ peg"{'a'}'bc' 'xyz' / {\ident}":
    assert matches[0] == "abc"
  else:
    assert false

  var g2 = peg"""S <- A B / C D
                 A <- 'a'+
                 B <- 'b'+
                 C <- 'c'+
                 D <- 'd'+
              """
  assert($g2 == "((A B) / (C D))")
  assert match("cccccdddddd", g2)
  assert("var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2") ==
         "var1<-keykey; var2<-key2key2")
  assert "var1=key; var2=key2".endsWith(peg"{\ident}'='{\ident}")

  if "aaaaaa" =~ peg"'aa' !. / ({'a'})+":
    assert matches[0] == "a"
  else:
    assert false
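  # Two additional sketches in the same style as the checks above: an escaped
  # decimal character code in a pattern, and `parallelReplace` with several
  # pattern/replacement pairs. The inputs are illustrative only.
  assert match("\t", peg"\9")
  assert(parallelReplace("(a (b c))", [(peg"'('", "["), (peg"')'", "]")]) ==
         "[a [b c]]")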