Nim - This repository contains the Nim compiler, Nim's stdlib, tools, and documentation. (mirror)

	Commit message (Expand)	Author	Age	Files	Lines
*	Removed outdated bootstrapping info from install.txt	Dominik Picheta	2013-08-23	1	-43/+0
*	fixes #287; bugfix: subrange checking is performed again	Araq	2013-04-12	1	-1/+1
*	cleanup of tests; use 'koch install' instead of 'install.sh' for a bootstrapp...	Araq	2013-04-11	1	-0/+1
*	Removes executable bit for text files.	Grzegorz Adam Hankiewicz	2013-03-16	1	-0/+0
*	win64 is a supported target; bugfix: nimrod c -r on windows; stdlib uses wid...	Araq	2012-03-04	1	-2/+6
*	doc improvements; added lazarus example	Araq	2011-07-24	1	-35/+38
*	0.8.12 bugfix: some files not in download package	Araq	2011-07-11	1	-3/+19
*	bugfix: 'set' overloadable; further steps for multi threading support	Araq	2011-07-08	1	-3/+3
*	const table support	Araq	2011-04-29	1	-1/+2
*	C source regenerated	Araq	2011-04-24	1	-0/+17
*	basic continue after error works; interactive mode more useful	Araq	2011-02-13	1	-1/+1
*	documentation improvements	Araq	2011-01-31	1	-3/+3
*	fixed pango/pangoutils new wrappers	Andreas Rumpf	2010-02-26	1	-0/+0
*	continued work on html/xmlparser	rumpf_a@web.de	2010-02-14	1	-0/+0
*	better subscript overloading	rumpf_a@web.de	2010-01-03	1	-2/+2
*	fixes	Andreas Rumpf	2009-11-14	1	-1/+1
*	version 0.8.2	rumpf_a@web.de	2009-10-21	1	-5/+4
*	added tools and web dirs	Andreas Rumpf	2009-09-15	1	-5/+2
*	overload resolution for proc vars	Andreas Rumpf	2009-06-24	1	-3/+1
*	version0.7.10	Andreas Rumpf	2009-06-08	1	-0/+4
*	version 0.7.8	Andreas Rumpf	2009-05-08	1	-66/+67
*	version 0.7.4	Andreas Rumpf	2009-01-07	1	-66/+66
*	version 0.7.2	Andreas Rumpf	2008-12-12	1	-1/+2
*	version 0.7.0	Andreas Rumpf	2008-11-16	1	-39/+18
*	too many changes to list	Andreas Rumpf	2008-08-23	1	-45/+26
*	first release	Rumpf	2008-06-23	1	-21/+25
*	Initial import	Andreas Rumpf	2008-06-22	1	-0/+101

# # # Nim's Runtime Library # (c) Copyright 2012 Andreas Rumpf # # See the file "copying.txt", included in this # distribution, for details about the copyright. # ## Simple PEG (Parsing expression grammar) matching. Uses no memorization, but ## uses superoperators and symbol inlining to improve performance. Note: ## Matching performance is hopefully competitive with optimized regular ## expression engines. ## ## .. include:: ../../doc/pegdocs.txt ## include "system/inclrtl" const useUnicode = true ## change this to deactivate proper UTF-8 support import strutils, macros import std/private/decode_helpers when useUnicode: import unicode export unicode.`==` const InlineThreshold = 5 ## number of leaves; -1 to disable inlining MaxSubpatterns* = 20 ## defines the maximum number of subpatterns that ## can be captured. More subpatterns cannot be captured! type PegKind* = enum pkEmpty, pkAny, ## any character (.) pkAnyRune, ## any Unicode character (_) pkNewLine, ## CR-LF, LF, CR pkLetter, ## Unicode letter pkLower, ## Unicode lower case letter pkUpper, ## Unicode upper case letter pkTitle, ## Unicode title character pkWhitespace, ## Unicode whitespace character pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar, ## single character to match pkCharChoice, pkNonTerminal, pkSequence, ## a b c ... --> Internal DSL: peg(a, b, c) pkOrderedChoice, ## a / b / ... --> Internal DSL: a / b or /[a, b, c] pkGreedyRep, ## a* --> Internal DSL: *a ## a+ --> (a a*) pkGreedyRepChar, ## x* where x is a single character (superop) pkGreedyRepSet, ## [set]* (superop) pkGreedyAny, ## .* or _* (superop) pkOption, ## a? --> Internal DSL: ?a pkAndPredicate, ## &a --> Internal DSL: &a pkNotPredicate, ## !a --> Internal DSL: !a pkCapture, ## {a} --> Internal DSL: capture(a) pkBackRef, ## $i --> Internal DSL: backref(i) pkBackRefIgnoreCase, pkBackRefIgnoreStyle, pkSearch, ## @a --> Internal DSL: !*a pkCapturedSearch, ## {@} a --> Internal DSL: !*\a pkRule, ## a <- b pkList, ## a, b pkStartAnchor ## ^ --> Internal DSL: startAnchor() NonTerminalFlag* = enum ntDeclared, ntUsed NonTerminalObj = object ## represents a non terminal symbol name: string ## the name of the symbol line: int ## line the symbol has been declared/used in col: int ## column the symbol has been declared/used in flags: set[NonTerminalFlag] ## the nonterminal's flags rule: Peg ## the rule that the symbol refers to Peg* {.shallow.} = object ## type that represents a PEG case kind: PegKind of pkEmpty..pkWhitespace: nil of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle: term: string of pkChar, pkGreedyRepChar: ch: char of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char] of pkNonTerminal: nt: NonTerminal of pkBackRef..pkBackRefIgnoreStyle: index: range[0..MaxSubpatterns] else: sons: seq[Peg] NonTerminal* = ref NonTerminalObj proc kind*(p: Peg): PegKind = p.kind ## Returns the *PegKind* of a given *Peg* object. proc term*(p: Peg): string = p.term ## Returns the *string* representation of a given *Peg* variant object ## where present. proc ch*(p: Peg): char = p.ch ## Returns the *char* representation of a given *Peg* variant object ## where present. proc charChoice*(p: Peg): ref set[char] = p.charChoice ## Returns the *charChoice* field of a given *Peg* variant object ## where present. proc nt*(p: Peg): NonTerminal = p.nt ## Returns the *NonTerminal* object of a given *Peg* variant object ## where present. proc index*(p: Peg): range[0..MaxSubpatterns] = p.index ## Returns the back-reference index of a captured sub-pattern in the ## *Captures* object for a given *Peg* variant object where present. iterator items*(p: Peg): Peg {.inline.} = ## Yields the child nodes of a *Peg* variant object where present. for s in p.sons: yield s iterator pairs*(p: Peg): (int, Peg) {.inline.} = ## Yields the indices and child nodes of a *Peg* variant object where present. for i in 0 ..< p.sons.len: yield (i, p.sons[i]) proc name*(nt: NonTerminal): string = nt.name ## Gets the name of the symbol represented by the parent *Peg* object variant ## of a given *NonTerminal*. proc line*(nt: NonTerminal): int = nt.line ## Gets the line number of the definition of the parent *Peg* object variant ## of a given *NonTerminal*. proc col*(nt: NonTerminal): int = nt.col ## Gets the column number of the definition of the parent *Peg* object variant ## of a given *NonTerminal*. proc flags*(nt: NonTerminal): set[NonTerminalFlag] = nt.flags ## Gets the *NonTerminalFlag*-typed flags field of the parent *Peg* variant ## object of a given *NonTerminal*. proc rule*(nt: NonTerminal): Peg = nt.rule ## Gets the *Peg* object representing the rule definition of the parent *Peg* ## object variant of a given *NonTerminal*. proc term*(t: string): Peg {.noSideEffect, rtl, extern: "npegs$1Str".} = ## constructs a PEG from a terminal string if t.len != 1: result = Peg(kind: pkTerminal, term: t) else: result = Peg(kind: pkChar, ch: t[0]) proc termIgnoreCase*(t: string): Peg {. noSideEffect, rtl, extern: "npegs$1".} = ## constructs a PEG from a terminal string; ignore case for matching result = Peg(kind: pkTerminalIgnoreCase, term: t) proc termIgnoreStyle*(t: string): Peg {. noSideEffect, rtl, extern: "npegs$1".} = ## constructs a PEG from a terminal string; ignore style for matching result = Peg(kind: pkTerminalIgnoreStyle, term: t) proc term*(t: char): Peg {.noSideEffect, rtl, extern: "npegs$1Char".} = ## constructs a PEG from a terminal char assert t != '\0' result = Peg(kind: pkChar, ch: t) proc charSet*(s: set[char]): Peg {.noSideEffect, rtl, extern: "npegs$1".} = ## constructs a PEG from a character set `s` assert '\0' notin s result = Peg(kind: pkCharChoice) new(result.charChoice) result.charChoice[] = s proc len(a: Peg): int {.inline.} = return a.sons.len proc add(d: var Peg, s: Peg) {.inline.} = add(d.sons, s) proc addChoice(dest: var Peg, elem: Peg) = var L = dest.len-1 if L >= 0 and dest.sons[L].kind == pkCharChoice: # caution! Do not introduce false aliasing here! case elem.kind of pkCharChoice: dest.sons[L] = charSet(dest.sons[L].charChoice[] + elem.charChoice[]) of pkChar: dest.sons[L] = charSet(dest.sons[L].charChoice[] + {elem.ch}) else: add(dest, elem) else: add(dest, elem) template multipleOp(k: PegKind, localOpt: untyped) = result = Peg(kind: k, sons: @[]) for x in items(a): if x.kind == k: for y in items(x.sons): localOpt(result, y) else: localOpt(result, x) if result.len == 1: result = result.sons[0] proc `/`*(a: varargs[Peg]): Peg {. noSideEffect, rtl, extern: "npegsOrderedChoice".} = ## constructs an ordered choice with the PEGs in `a` multipleOp(pkOrderedChoice, addChoice) proc addSequence(dest: var Peg, elem: Peg) = var L = dest.len-1 if L >= 0 and dest.sons[L].kind == pkTerminal: # caution! Do not introduce false aliasing here! case elem.kind of pkTerminal: dest.sons[L] = term(dest.sons[L].term & elem.term) of pkChar: dest.sons[L] = term(dest.sons[L].term & elem.ch) else: add(dest, elem) else: add(dest, elem) proc sequence*(a: varargs[Peg]): Peg {. noSideEffect, rtl, extern: "npegs$1".} = ## constructs a sequence with all the PEGs from `a` multipleOp(pkSequence, addSequence) proc `?`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsOptional".} = ## constructs an optional for the PEG `a` if a.kind in {pkOption, pkGreedyRep, pkGreedyAny, pkGreedyRepChar, pkGreedyRepSet}: # a* ? --> a* # a? ? --> a? result = a else: result = Peg(kind: pkOption, sons: @[a]) proc `*`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsGreedyRep".} = ## constructs a "greedy repetition" for the PEG `a` case a.kind of pkGreedyRep, pkGreedyRepChar, pkGreedyRepSet, pkGreedyAny, pkOption: assert false # produces endless loop! of pkChar: result = Peg(kind: pkGreedyRepChar, ch: a.ch) of pkCharChoice: result = Peg(kind: pkGreedyRepSet, charChoice: a.charChoice) of pkAny, pkAnyRune: result = Peg(kind: pkGreedyAny) else: result = Peg(kind: pkGreedyRep, sons: @[a]) proc `!*`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsSearch".} = ## constructs a "search" for the PEG `a` result = Peg(kind: pkSearch, sons: @[a]) proc `!*\`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npgegsCapturedSearch".} = ## constructs a "captured search" for the PEG `a` result = Peg(kind: pkCapturedSearch, sons: @[a]) proc `+`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsGreedyPosRep".} = ## constructs a "greedy positive repetition" with the PEG `a` return sequence(a, *a) proc `&`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsAndPredicate".} = ## constructs an "and predicate" with the PEG `a` result = Peg(kind: pkAndPredicate, sons: @[a]) proc `!`*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsNotPredicate".} = ## constructs a "not predicate" with the PEG `a` result = Peg(kind: pkNotPredicate, sons: @[a]) proc any*: Peg {.inline.} = ## constructs the PEG `any character`:idx: (``.``) result = Peg(kind: pkAny) proc anyRune*: Peg {.inline.} = ## constructs the PEG `any rune`:idx: (``_``) result = Peg(kind: pkAnyRune) proc newLine*: Peg {.inline.} = ## constructs the PEG `newline`:idx: (``\n``) result = Peg(kind: pkNewLine) proc unicodeLetter*: Peg {.inline.} = ## constructs the PEG ``\letter`` which matches any Unicode letter. result = Peg(kind: pkLetter) proc unicodeLower*: Peg {.inline.} = ## constructs the PEG ``\lower`` which matches any Unicode lowercase letter. result = Peg(kind: pkLower) proc unicodeUpper*: Peg {.inline.} = ## constructs the PEG ``\upper`` which matches any Unicode uppercase letter. result = Peg(kind: pkUpper) proc unicodeTitle*: Peg {.inline.} = ## constructs the PEG ``\title`` which matches any Unicode title letter. result = Peg(kind: pkTitle) proc unicodeWhitespace*: Peg {.inline.} = ## constructs the PEG ``\white`` which matches any Unicode ## whitespace character. result = Peg(kind: pkWhitespace) proc startAnchor*: Peg {.inline.} = ## constructs the PEG ``^`` which matches the start of the input. result = Peg(kind: pkStartAnchor) proc endAnchor*: Peg {.inline.} = ## constructs the PEG ``$`` which matches the end of the input. result = !any() proc capture*(a: Peg): Peg {.noSideEffect, rtl, extern: "npegsCapture".} = ## constructs a capture with the PEG `a` result = Peg(kind: pkCapture, sons: @[a]) proc backref*(index: range[1..MaxSubpatterns]): Peg {. noSideEffect, rtl, extern: "npegs$1".} = ## constructs a back reference of the given `index`. `index` starts counting ## from 1. result = Peg(kind: pkBackRef, index: index-1) proc backrefIgnoreCase*(index: range[1..MaxSubpatterns]): Peg {. noSideEffect, rtl, extern: "npegs$1".} = ## constructs a back reference of the given `index`. `index` starts counting ## from 1. Ignores case for matching. result = Peg(kind: pkBackRefIgnoreCase, index: index-1) proc backrefIgnoreStyle*(index: range[1..MaxSubpatterns]): Peg {. noSideEffect, rtl, extern: "npegs$1".} = ## constructs a back reference of the given `index`. `index` starts counting ## from 1. Ignores style for matching. result = Peg(kind: pkBackRefIgnoreStyle, index: index-1) proc spaceCost(n: Peg): int = case n.kind of pkEmpty: discard of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar, pkGreedyRepChar, pkCharChoice, pkGreedyRepSet, pkAny..pkWhitespace, pkGreedyAny: result = 1 of pkNonTerminal: # we cannot inline a rule with a non-terminal result = InlineThreshold+1 else: for i in 0..n.len-1: inc(result, spaceCost(n.sons[i])) if result >= InlineThreshold: break proc nonterminal*(n: NonTerminal): Peg {. noSideEffect, rtl, extern: "npegs$1".} = ## constructs a PEG that consists of the nonterminal symbol assert n != nil if ntDeclared in n.flags and spaceCost(n.rule) < InlineThreshold: when false: echo "inlining symbol: ", n.name result = n.rule # inlining of rule enables better optimizations else: result = Peg(kind: pkNonTerminal, nt: n) proc newNonTerminal*(name: string, line, column: int): NonTerminal {. noSideEffect, rtl, extern: "npegs$1".} = ## constructs a nonterminal symbol result = NonTerminal(name: name, line: line, col: column) template letters*: Peg = ## expands to ``charset({'A'..'Z', 'a'..'z'})`` charSet({'A'..'Z', 'a'..'z'}) template digits*: Peg = ## expands to ``charset({'0'..'9'})`` charSet({'0'..'9'}) template whitespace*: Peg = ## expands to ``charset({' ', '\9'..'\13'})`` charSet({' ', '\9'..'\13'}) template identChars*: Peg = ## expands to ``charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})`` charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'}) template identStartChars*: Peg = ## expands to ``charset({'A'..'Z', 'a'..'z', '_'})`` charSet({'a'..'z', 'A'..'Z', '_'}) template ident*: Peg = ## same as ``[a-zA-Z_][a-zA-z_0-9]*``; standard identifier sequence(charSet({'a'..'z', 'A'..'Z', '_'}), *charSet({'a'..'z', 'A'..'Z', '0'..'9', '_'})) template natural*: Peg = ## same as ``\d+`` +digits # ------------------------- debugging ----------------------------------------- proc esc(c: char, reserved = {'\0'..'\255'}): string = case c of '\b': result = "\\b" of '\t': result = "\\t" of '\c': result = "\\c" of '\L': result = "\\l" of '\v': result = "\\v" of '\f': result = "\\f" of '\e': result = "\\e" of '\a': result = "\\a" of '\\': result = "\\\\" of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c elif c < ' ' or c >= '\127': result = '\\' & $ord(c) elif c in reserved: result = '\\' & c else: result = $c proc singleQuoteEsc(c: char): string = return "'" & esc(c, {'\''}) & "'" proc singleQuoteEsc(str: string): string = result = "'" for c in items(str): add result, esc(c, {'\''}) add result, '\'' proc charSetEscAux(cc: set[char]): string = const reserved = {'^', '-', ']'} result = "" var c1 = 0 while c1 <= 0xff: if chr(c1) in cc: var c2 = c1 while c2 < 0xff and chr(succ(c2)) in cc: inc(c2) if c1 == c2: add result, esc(chr(c1), reserved) elif c2 == succ(c1): add result, esc(chr(c1), reserved) & esc(chr(c2), reserved) else: add result, esc(chr(c1), reserved) & '-' & esc(chr(c2), reserved) c1 = c2 inc(c1) proc charSetEsc(cc: set[char]): string = if card(cc) >= 128+64: result = "[^" & charSetEscAux({'\1'..'\xFF'} - cc) & ']' else: result = '[' & charSetEscAux(cc) & ']' proc toStrAux(r: Peg, res: var string) = case r.kind of pkEmpty: add(res, "()") of pkAny: add(res, '.') of pkAnyRune: add(res, '_') of pkLetter: add(res, "\\letter") of pkLower: add(res, "\\lower") of pkUpper: add(res, "\\upper") of pkTitle: add(res, "\\title") of pkWhitespace: add(res, "\\white") of pkNewLine: add(res, "\\n") of pkTerminal: add(res, singleQuoteEsc(r.term)) of pkTerminalIgnoreCase: add(res, 'i') add(res, singleQuoteEsc(r.term)) of pkTerminalIgnoreStyle: add(res, 'y') add(res, singleQuoteEsc(r.term)) of pkChar: add(res, singleQuoteEsc(r.ch)) of pkCharChoice: add(res, charSetEsc(r.charChoice[])) of pkNonTerminal: add(res, r.nt.name) of pkSequence: add(res, '(') toStrAux(r.sons[0], res) for i in 1 .. high(r.sons): add(res, ' ') toStrAux(r.sons[i], res) add(res, ')') of pkOrderedChoice: add(res, '(') toStrAux(r.sons[0], res) for i in 1 .. high(r.sons): add(res, " / ") toStrAux(r.sons[i], res) add(res, ')') of pkGreedyRep: toStrAux(r.sons[0], res) add(res, '*') of pkGreedyRepChar: add(res, singleQuoteEsc(r.ch)) add(res, '*') of pkGreedyRepSet: add(res, charSetEsc(r.charChoice[])) add(res, '*') of pkGreedyAny: add(res, ".*") of pkOption: toStrAux(r.sons[0], res) add(res, '?') of pkAndPredicate: add(res, '&') toStrAux(r.sons[0], res) of pkNotPredicate: add(res, '!') toStrAux(r.sons[0], res) of pkSearch: add(res, '@') toStrAux(r.sons[0], res) of pkCapturedSearch: add(res, "{@}") toStrAux(r.sons[0], res) of pkCapture: add(res, '{') toStrAux(r.sons[0], res) add(res, '}') of pkBackRef: add(res, '$') add(res, $r.index) of pkBackRefIgnoreCase: add(res, "i$") add(res, $r.index) of pkBackRefIgnoreStyle: add(res, "y$") add(res, $r.index) of pkRule: toStrAux(r.sons[0], res) add(res, " <- ") toStrAux(r.sons[1], res) of pkList: for i in 0 .. high(r.sons): toStrAux(r.sons[i], res) add(res, "\n") of pkStartAnchor: add(res, '^') proc `$` *(r: Peg): string {.noSideEffect, rtl, extern: "npegsToString".} = ## converts a PEG to its string representation result = "" toStrAux(r, result) # --------------------- core engine ------------------------------------------- type Captures* = object ## contains the captured substrings. matches: array[0..MaxSubpatterns-1, tuple[first, last: int]] ml: int origStart: int proc bounds*(c: Captures, i: range[0..MaxSubpatterns-1]): tuple[first, last: int] = ## returns the bounds ``[first..last]`` of the `i`'th capture. result = c.matches[i] when not useUnicode: type Rune = char template fastRuneAt(s, i, ch) = ch = s[i] inc(i) template runeLenAt(s, i): untyped = 1 proc isAlpha(a: char): bool {.inline.} = return a in {'a'..'z', 'A'..'Z'} proc isUpper(a: char): bool {.inline.} = return a in {'A'..'Z'} proc isLower(a: char): bool {.inline.} = return a in {'a'..'z'} proc isTitle(a: char): bool {.inline.} = return false proc isWhiteSpace(a: char): bool {.inline.} = return a in {' ', '\9'..'\13'} template matchOrParse(mopProc: untyped) = # Used to make the main matcher proc *rawMatch* as well as event parser # procs. For the former, *enter* and *leave* event handler code generators # are provided which just return *discard*. proc mopProc(s: string, p: Peg, start: int, c: var Captures): int = proc matchBackRef(s: string, p: Peg, start: int, c: var Captures): int = # Parse handler code must run in an *of* clause of its own for each # *PegKind*, so we encapsulate the identical clause body for # *pkBackRef..pkBackRefIgnoreStyle* here. if p.index >= c.ml: return -1 var (a, b) = c.matches[p.index] var n: Peg case p.kind of pkBackRef: n = Peg(kind: pkTerminal, term: s.substr(a, b)) of pkBackRefIgnoreStyle: n = Peg(kind: pkTerminalIgnoreStyle, term: s.substr(a, b)) of pkBackRefIgnoreCase: n = Peg(kind: pkTerminalIgnoreCase, term: s.substr(a, b)) else: assert(false, "impossible case") mopProc(s, n, start, c) case p.kind of pkEmpty: enter(pkEmpty, s, p, start) result = 0 # match of length 0 leave(pkEmpty, s, p, start, result) of pkAny: enter(pkAny, s, p, start) if start < s.len: result = 1 else: result = -1 leave(pkAny, s, p, start, result) of pkAnyRune: enter(pkAnyRune, s, p, start) if start < s.len: result = runeLenAt(s, start) else: result = -1 leave(pkAnyRune, s, p, start, result) of pkLetter: enter(pkLetter, s, p, start) if start < s.len: var a: Rune result = start fastRuneAt(s, result, a) if isAlpha(a): dec(result, start) else: result = -1 else: result = -1 leave(pkLetter, s, p, start, result) of pkLower: enter(pkLower, s, p, start) if start < s.len: var a: Rune result = start fastRuneAt(s, result, a) if isLower(a): dec(result, start) else: result = -1 else: result = -1 leave(pkLower, s, p, start, result) of pkUpper: enter(pkUpper, s, p, start) if start < s.len: var a: Rune result = start fastRuneAt(s, result, a) if isUpper(a): dec(result, start) else: result = -1 else: result = -1 leave(pkUpper, s, p, start, result) of pkTitle: enter(pkTitle, s, p, start) if start < s.len: var a: Rune result = start fastRuneAt(s, result, a) if isTitle(a): dec(result, start) else: result = -1 else: result = -1 leave(pkTitle, s, p, start, result) of pkWhitespace: enter(pkWhitespace, s, p, start) if start < s.len: var a: Rune result = start fastRuneAt(s, result, a) if isWhiteSpace(a): dec(result, start) else: result = -1 else: result = -1 leave(pkWhitespace, s, p, start, result) of pkGreedyAny: enter(pkGreedyAny, s, p, start) result = len(s) - start leave(pkGreedyAny, s, p, start, result) of pkNewLine: enter(pkNewLine, s, p, start) if start < s.len and s[start] == '\L': result = 1 elif start < s.len and s[start] == '\C': if start+1 < s.len and s[start+1] == '\L': result = 2 else: result = 1 else: result = -1 leave(pkNewLine, s, p, start, result) of pkTerminal: enter(pkTerminal, s, p, start) result = len(p.term) for i in 0..result-1: if start+i >= s.len or p.term[i] != s[start+i]: result = -1 break leave(pkTerminal, s, p, start, result) of pkTerminalIgnoreCase: enter(pkTerminalIgnoreCase, s, p, start) var i = 0 a, b: Rune result = start while i < len(p.term): if result >= s.len: result = -1 break fastRuneAt(p.term, i, a) fastRuneAt(s, result, b) if toLower(a) != toLower(b): result = -1 break dec(result, start) leave(pkTerminalIgnoreCase, s, p, start, result) of pkTerminalIgnoreStyle: enter(pkTerminalIgnoreStyle, s, p, start) var i = 0 a, b: Rune result = start while i < len(p.term): while i < len(p.term): fastRuneAt(p.term, i, a) if a != Rune('_'): break while result < s.len: fastRuneAt(s, result, b) if b != Rune('_'): break if result >= s.len: if i >= p.term.len: break else: result = -1 break elif toLower(a) != toLower(b): result = -1 break dec(result, start) leave(pkTerminalIgnoreStyle, s, p, start, result) of pkChar: enter(pkChar, s, p, start) if start < s.len and p.ch == s[start]: result = 1 else: result = -1 leave(pkChar, s, p, start, result) of pkCharChoice: enter(pkCharChoice, s, p, start) if start < s.len and contains(p.charChoice[], s[start]): result = 1 else: result = -1 leave(pkCharChoice, s, p, start, result) of pkNonTerminal: enter(pkNonTerminal, s, p, start) var oldMl = c.ml when false: echo "enter: ", p.nt.name result = mopProc(s, p.nt.rule, start, c) when false: echo "leave: ", p.nt.name if result < 0: c.ml = oldMl leave(pkNonTerminal, s, p, start, result) of pkSequence: enter(pkSequence, s, p, start) var oldMl = c.ml result = 0 for i in 0..high(p.sons): var x = mopProc(s, p.sons[i], start+result, c) if x < 0: c.ml = oldMl result = -1 break else: inc(result, x) leave(pkSequence, s, p, start, result) of pkOrderedChoice: enter(pkOrderedChoice, s, p, start) var oldMl = c.ml for i in 0..high(p.sons): result = mopProc(s, p.sons[i], start, c) if result >= 0: break c.ml = oldMl leave(pkOrderedChoice, s, p, start, result) of pkSearch: enter(pkSearch, s, p, start) var oldMl = c.ml result = 0 while start+result <= s.len: var x = mopProc(s, p.sons[0], start+result, c) if x >= 0: inc(result, x) leave(pkSearch, s, p, start, result) return inc(result) result = -1 c.ml = oldMl leave(pkSearch, s, p, start, result) of pkCapturedSearch: enter(pkCapturedSearch, s, p, start) var idx = c.ml # reserve a slot for the subpattern inc(c.ml) result = 0 while start+result <= s.len: var x = mopProc(s, p.sons[0], start+result, c) if x >= 0: if idx < MaxSubpatterns: c.matches[idx] = (start, start+result-1) #else: silently ignore the capture inc(result, x) leave(pkCapturedSearch, s, p, start, result) return inc(result) result = -1 c.ml = idx leave(pkCapturedSearch, s, p, start, result) of pkGreedyRep: enter(pkGreedyRep, s, p, start) result = 0 while true: var x = mopProc(s, p.sons[0], start+result, c) # if x == 0, we have an endless loop; so the correct behaviour would be # not to break. But endless loops can be easily introduced: # ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the # expected thing in this case. if x <= 0: break inc(result, x) leave(pkGreedyRep, s, p, start, result) of pkGreedyRepChar: enter(pkGreedyRepChar, s, p, start) result = 0 var ch = p.ch while start+result < s.len and ch == s[start+result]: inc(result) leave(pkGreedyRepChar, s, p, start, result) of pkGreedyRepSet: enter(pkGreedyRepSet, s, p, start) result = 0 while start+result < s.len and contains(p.charChoice[], s[start+result]): inc(result) leave(pkGreedyRepSet, s, p, start, result) of pkOption: enter(pkOption, s, p, start) result = max(0, mopProc(s, p.sons[0], start, c)) leave(pkOption, s, p, start, result) of pkAndPredicate: enter(pkAndPredicate, s, p, start) var oldMl = c.ml result = mopProc(s, p.sons[0], start, c) if result >= 0: result = 0 # do not consume anything else: c.ml = oldMl leave(pkAndPredicate, s, p, start, result) of pkNotPredicate: enter(pkNotPredicate, s, p, start) var oldMl = c.ml result = mopProc(s, p.sons[0], start, c) if result < 0: result = 0 else: c.ml = oldMl result = -1 leave(pkNotPredicate, s, p, start, result) of pkCapture: enter(pkCapture, s, p, start) var idx = c.ml # reserve a slot for the subpattern inc(c.ml) result = mopProc(s, p.sons[0], start, c) if result >= 0: if idx < MaxSubpatterns: c.matches[idx] = (start, start+result-1) #else: silently ignore the capture else: c.ml = idx leave(pkCapture, s, p, start, result) of pkBackRef: enter(pkBackRef, s, p, start) result = matchBackRef(s, p, start, c) leave(pkBackRef, s, p, start, result) of pkBackRefIgnoreCase: enter(pkBackRefIgnoreCase, s, p, start) result = matchBackRef(s, p, start, c) leave(pkBackRefIgnoreCase, s, p, start, result) of pkBackRefIgnoreStyle: enter(pkBackRefIgnoreStyle, s, p, start) result = matchBackRef(s, p, start, c) leave(pkBackRefIgnoreStyle, s, p, start, result) of pkStartAnchor: enter(pkStartAnchor, s, p, start) if c.origStart == start: result = 0 else: result = -1 leave(pkStartAnchor, s, p, start, result) of pkRule, pkList: assert false proc rawMatch*(s: string, p: Peg, start: int, c: var Captures): int {.noSideEffect, rtl, extern: "npegs$1".} = ## low-level matching proc that implements the PEG interpreter. Use this ## for maximum efficiency (every other PEG operation ends up calling this ## proc). ## Returns -1 if it does not match, else the length of the match # Set the handler generators to produce do-nothing handlers. template enter(pk, s, p, start) = discard template leave(pk, s, p, start, length) = discard matchOrParse(matchIt) result = matchIt(s, p, start, c) macro mkHandlerTplts(handlers: untyped): untyped = # Transforms the handler spec in *handlers* into handler templates. # The AST structure of *handlers[0]*: # # .. code-block:: # StmtList # Call # Ident "pkNonTerminal" # StmtList # Call # Ident "enter" # StmtList # <handler code block> # Call # Ident "leave" # StmtList # <handler code block> # Call # Ident "pkChar" # StmtList # Call # Ident "leave" # StmtList # <handler code block> # ... proc mkEnter(hdName, body: NimNode): NimNode = template helper(hdName, body) {.dirty.} = template hdName(s, p, start) = let s {.inject.} = s let p {.inject.} = p let start {.inject.} = start body result = getAst(helper(hdName, body)) template mkLeave(hdPostf, body) {.dirty.} = # this has to be dirty to be able to capture *result* as *length* in # *leaveXX* calls. template `leave hdPostf`(s, p, start, length) = body result = newStmtList() for topCall in handlers[0]: if topCall.kind notin nnkCallKinds: error("Call syntax expected.", topCall) let pegKind = topCall[0] if pegKind.kind notin {nnkIdent, nnkSym}: error("PegKind expected.", pegKind) if 2 == topCall.len: for hdDef in topCall[1]: if hdDef.kind notin nnkCallKinds: error("Call syntax expected.", hdDef) if hdDef[0].kind notin {nnkIdent, nnkSym}: error("Handler identifier expected.", hdDef[0]) if 2 == hdDef.len: let hdPostf = substr(pegKind.strVal, 2) case hdDef[0].strVal of "enter": result.add mkEnter(newIdentNode("enter" & hdPostf), hdDef[1]) of "leave": result.add getAst(mkLeave(ident(hdPostf), hdDef[1])) else: error( "Unsupported handler identifier, expected 'enter' or 'leave'.", hdDef[0] ) template eventParser*(pegAst, handlers: untyped): (proc(s: string): int) = ## Generates an interpreting event parser *proc* according to the specified ## PEG AST and handler code blocks. The *proc* can be called with a string ## to be parsed and will execute the handler code blocks whenever their ## associated grammar element is matched. It returns -1 if the string does not ## match, else the length of the total match. The following example code ## evaluates an arithmetic expression defined by a simple PEG: ## ## .. code-block:: nim ## import std/[strutils, pegs] ## ## let ## pegAst = """ ## Expr <- Sum ## Sum <- Product (('+' / '-')Product)* ## Product <- Value (('*' / '/')Value)* ## Value <- [0-9]+ / '(' Expr ')' ## """.peg ## txt = "(5+3)/2-7*22" ## ## var ## pStack: seq[string] = @[] ## valStack: seq[float] = @[] ## opStack = "" ## let ## parseArithExpr = pegAst.eventParser: ## pkNonTerminal: ## enter: ## pStack.add p.nt.name ## leave: ## pStack.setLen pStack.high ## if length > 0: ## let matchStr = s.substr(start, start+length-1) ## case p.nt.name ## of "Value": ## try: ## valStack.add matchStr.parseFloat ## echo valStack ## except ValueError: ## discard ## of "Sum", "Product": ## try: ## let val = matchStr.parseFloat ## except ValueError: ## if valStack.len > 1 and opStack.len > 0: ## valStack[^2] = case opStack[^1] ## of '+': valStack[^2] + valStack[^1] ## of '-': valStack[^2] - valStack[^1] ## of '*': valStack[^2] * valStack[^1] ## else: valStack[^2] / valStack[^1] ## valStack.setLen valStack.high ## echo valStack ## opStack.setLen opStack.high ## echo opStack ## pkChar: ## leave: ## if length == 1 and "Value" != pStack[^1]: ## let matchChar = s[start] ## opStack.add matchChar ## echo opStack ## ## let pLen = parseArithExpr(txt) ## ## The *handlers* parameter consists of code blocks for *PegKinds*, ## which define the grammar elements of interest. Each block can contain ## handler code to be executed when the parser enters and leaves text ## matching the grammar element. An *enter* handler can access the specific ## PEG AST node being matched as *p*, the entire parsed string as *s* ## and the position of the matched text segment in *s* as *start*. A *leave* ## handler can access *p*, *s*, *start* and also the length of the matched ## text segment as *length*. For an unsuccessful match, the *enter* and ## *leave* handlers will be executed, with *length* set to -1. ## ## Symbols declared in an *enter* handler can be made visible in the ## corresponding *leave* handler by annotating them with an *inject* pragma. proc rawParse(s: string, p: Peg, start: int, c: var Captures): int {.genSym.} = # binding from *macros* bind strVal mkHandlerTplts: handlers macro enter(pegKind, s, pegNode, start: untyped): untyped = # This is called by the matcher code in *matchOrParse* at the # start of the code for a grammar element of kind *pegKind*. # Expands to a call to the handler template if one was generated # by *mkHandlerTplts*. template mkDoEnter(hdPostf, s, pegNode, start) = when declared(`enter hdPostf`): `enter hdPostf`(s, pegNode, start): else: discard let hdPostf = ident(substr(strVal(pegKind), 2)) getAst(mkDoEnter(hdPostf, s, pegNode, start)) macro leave(pegKind, s, pegNode, start, length: untyped): untyped = # Like *enter*, but called at the end of the matcher code for # a grammar element of kind *pegKind*. template mkDoLeave(hdPostf, s, pegNode, start, length) = when declared(`leave hdPostf`): `leave hdPostf`(s, pegNode, start, length): else: discard let hdPostf = ident(substr(strVal(pegKind), 2)) getAst(mkDoLeave(hdPostf, s, pegNode, start, length)) matchOrParse(parseIt) parseIt(s, p, start, c) proc parser(s: string): int {.genSym.} = # the proc to be returned var ms: array[MaxSubpatterns, (int, int)] cs = Captures(matches: ms, ml: 0, origStart: 0) rawParse(s, pegAst, 0, cs) parser template fillMatches(s, caps, c) = for k in 0..c.ml-1: let startIdx = c.matches[k][0] let endIdx = c.matches[k][1] if startIdx != -1: caps[k] = substr(s, startIdx, endIdx) else: caps[k] = "" proc matchLen*(s: string, pattern: Peg, matches: var openArray[string], start = 0): int {.noSideEffect, rtl, extern: "npegs$1Capture".} = ## the same as ``match``, but it returns the length of the match, ## if there is no match, -1 is returned. Note that a match length ## of zero can happen. It's possible that a suffix of `s` remains ## that does not belong to the match. var c: Captures c.origStart = start result = rawMatch(s, pattern, start, c) if result >= 0: fillMatches(s, matches, c) proc matchLen*(s: string, pattern: Peg, start = 0): int {.noSideEffect, rtl, extern: "npegs$1".} = ## the same as ``match``, but it returns the length of the match, ## if there is no match, -1 is returned. Note that a match length ## of zero can happen. It's possible that a suffix of `s` remains ## that does not belong to the match. var c: Captures c.origStart = start result = rawMatch(s, pattern, start, c) proc match*(s: string, pattern: Peg, matches: var openArray[string], start = 0): bool {.noSideEffect, rtl, extern: "npegs$1Capture".} = ## returns ``true`` if ``s[start..]`` matches the ``pattern`` and ## the captured substrings in the array ``matches``. If it does not ## match, nothing is written into ``matches`` and ``false`` is ## returned. result = matchLen(s, pattern, matches, start) != -1 proc match*(s: string, pattern: Peg, start = 0): bool {.noSideEffect, rtl, extern: "npegs$1".} = ## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``. result = matchLen(s, pattern, start) != -1 proc find*(s: string, pattern: Peg, matches: var openArray[string], start = 0): int {.noSideEffect, rtl, extern: "npegs$1Capture".} = ## returns the starting position of ``pattern`` in ``s`` and the captured ## substrings in the array ``matches``. If it does not match, nothing ## is written into ``matches`` and -1 is returned. var c: Captures c.origStart = start for i in start .. s.len-1: c.ml = 0 if rawMatch(s, pattern, i, c) >= 0: fillMatches(s, matches, c) return i return -1 # could also use the pattern here: (!P .)* P proc findBounds*(s: string, pattern: Peg, matches: var openArray[string], start = 0): tuple[first, last: int] {. noSideEffect, rtl, extern: "npegs$1Capture".} = ## returns the starting position and end position of ``pattern`` in ``s`` ## and the captured ## substrings in the array ``matches``. If it does not match, nothing ## is written into ``matches`` and (-1,0) is returned. var c: Captures c.origStart = start for i in start .. s.len-1: c.ml = 0 var L = rawMatch(s, pattern, i, c) if L >= 0: fillMatches(s, matches, c) return (i, i+L-1) return (-1, 0) proc find*(s: string, pattern: Peg, start = 0): int {.noSideEffect, rtl, extern: "npegs$1".} = ## returns the starting position of ``pattern`` in ``s``. If it does not ## match, -1 is returned. var c: Captures c.origStart = start for i in start .. s.len-1: if rawMatch(s, pattern, i, c) >= 0: return i return -1 iterator findAll*(s: string, pattern: Peg, start = 0): string = ## yields all matching *substrings* of `s` that match `pattern`. var c: Captures c.origStart = start var i = start while i < s.len: c.ml = 0 var L = rawMatch(s, pattern, i, c) if L < 0: inc(i, 1) else: yield substr(s, i, i+L-1) inc(i, L) proc findAll*(s: string, pattern: Peg, start = 0): seq[string] {. noSideEffect, rtl, extern: "npegs$1".} = ## returns all matching *substrings* of `s` that match `pattern`. ## If it does not match, @[] is returned. result = @[] for it in findAll(s, pattern, start): result.add it template `=~`*(s: string, pattern: Peg): bool = ## This calls ``match`` with an implicit declared ``matches`` array that ## can be used in the scope of the ``=~`` call: ## ## .. code-block:: nim ## ## if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}": ## # matches a key=value pair: ## echo("Key: ", matches[0]) ## echo("Value: ", matches[1]) ## elif line =~ peg"\s*{'#'.*}": ## # matches a comment ## # note that the implicit ``matches`` array is different from the ## # ``matches`` array of the first branch ## echo("comment: ", matches[0]) ## else: ## echo("syntax error") ## bind MaxSubpatterns when not declaredInScope(matches): var matches {.inject.}: array[0..MaxSubpatterns-1, string] match(s, pattern, matches) # ------------------------- more string handling ------------------------------ proc contains*(s: string, pattern: Peg, start = 0): bool {. noSideEffect, rtl, extern: "npegs$1".} = ## same as ``find(s, pattern, start) >= 0`` return find(s, pattern, start) >= 0 proc contains*(s: string, pattern: Peg, matches: var openArray[string], start = 0): bool {.noSideEffect, rtl, extern: "npegs$1Capture".} = ## same as ``find(s, pattern, matches, start) >= 0`` return find(s, pattern, matches, start) >= 0 proc startsWith*(s: string, prefix: Peg, start = 0): bool {. noSideEffect, rtl, extern: "npegs$1".} = ## returns true if `s` starts with the pattern `prefix` result = matchLen(s, prefix, start) >= 0 proc endsWith*(s: string, suffix: Peg, start = 0): bool {. noSideEffect, rtl, extern: "npegs$1".} = ## returns true if `s` ends with the pattern `suffix` var c: Captures c.origStart = start for i in start .. s.len-1: if rawMatch(s, suffix, i, c) == s.len - i: return true proc replacef*(s: string, sub: Peg, by: string): string {. noSideEffect, rtl, extern: "npegs$1".} = ## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by` ## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples: ## ## .. code-block:: nim ## "var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2") ## ## Results in: ## ## .. code-block:: nim ## ## "var1<-keykey; val2<-key2key2" result = "" var i = 0 var caps: array[0..MaxSubpatterns-1, string] var c: Captures while i < s.len: c.ml = 0 var x = rawMatch(s, sub, i, c) if x <= 0: add(result, s[i]) inc(i) else: fillMatches(s, caps, c) addf(result, by, caps) inc(i, x) add(result, substr(s, i)) proc replace*(s: string, sub: Peg, by = ""): string {. noSideEffect, rtl, extern: "npegs$1".} = ## Replaces `sub` in `s` by the string `by`. Captures cannot be accessed ## in `by`. result = "" var i = 0 var c: Captures while i < s.len: var x = rawMatch(s, sub, i, c) if x <= 0: add(result, s[i]) inc(i) else: add(result, by) inc(i, x) add(result, substr(s, i)) proc parallelReplace*(s: string, subs: varargs[ tuple[pattern: Peg, repl: string]]): string {. noSideEffect, rtl, extern: "npegs$1".} = ## Returns a modified copy of `s` with the substitutions in `subs` ## applied in parallel. result = "" var i = 0 var c: Captures var caps: array[0..MaxSubpatterns-1, string] while i < s.len: block searchSubs: for j in 0..high(subs): c.ml = 0 var x = rawMatch(s, subs[j][0], i, c) if x > 0: fillMatches(s, caps, c) addf(result, subs[j][1], caps) inc(i, x) break searchSubs add(result, s[i]) inc(i) # copy the rest: add(result, substr(s, i)) proc replace*(s: string, sub: Peg, cb: proc( match: int, cnt: int, caps: openArray[string]): string): string {. rtl, extern: "npegs$1cb".} = ## Replaces `sub` in `s` by the resulting strings from the callback. ## The callback proc receives the index of the current match (starting with 0), ## the count of captures and an open array with the captures of each match. Examples: ## ## .. code-block:: nim ## ## proc handleMatches*(m: int, n: int, c: openArray[string]): string = ## result = "" ## if m > 0: ## result.add ", " ## result.add case n: ## of 2: c[0].toLower & ": '" & c[1] & "'" ## of 1: c[0].toLower & ": ''" ## else: "" ## ## let s = "Var1=key1;var2=Key2; VAR3" ## echo s.replace(peg"{\ident}('='{\ident})* ';'* \s*", handleMatches) ## ## Results in: ## ## .. code-block:: nim ## ## "var1: 'key1', var2: 'Key2', var3: ''" result = "" var i = 0 var caps: array[0..MaxSubpatterns-1, string] var c: Captures var m = 0 while i < s.len: c.ml = 0 var x = rawMatch(s, sub, i, c) if x <= 0: add(result, s[i]) inc(i) else: fillMatches(s, caps, c) add(result, cb(m, c.ml, caps)) inc(i, x) inc(m) add(result, substr(s, i)) when not defined(js): proc transformFile*(infile, outfile: string, subs: varargs[tuple[pattern: Peg, repl: string]]) {. rtl, extern: "npegs$1".} = ## reads in the file `infile`, performs a parallel replacement (calls ## `parallelReplace`) and writes back to `outfile`. Raises ``IOError`` if an ## error occurs. This is supposed to be used for quick scripting. ## ## **Note**: this proc does not exist while using the JS backend. var x = readFile(infile) writeFile(outfile, x.parallelReplace(subs)) iterator split*(s: string, sep: Peg): string = ## Splits the string `s` into substrings. ## ## Substrings are separated by the PEG `sep`. ## Examples: ## ## .. code-block:: nim ## for word in split("00232this02939is39an22example111", peg"\d+"): ## writeLine(stdout, word) ## ## Results in: ## ## .. code-block:: nim ## "this" ## "is" ## "an" ## "example" ## var c: Captures var first = 0 last = 0 while last < len(s): c.ml = 0 var x = rawMatch(s, sep, last, c) if x > 0: inc(last, x) first = last while last < len(s): inc(last) c.ml = 0 x = rawMatch(s, sep, last, c) if x > 0: break if first < last: yield substr(s, first, last-1) proc split*(s: string, sep: Peg): seq[string] {. noSideEffect, rtl, extern: "npegs$1".} = ## Splits the string `s` into substrings. result = @[] for it in split(s, sep): result.add it # ------------------- scanner ------------------------------------------------- type Modifier = enum modNone, modVerbatim, modIgnoreCase, modIgnoreStyle TokKind = enum ## enumeration of all tokens tkInvalid, ## invalid token tkEof, ## end of file reached tkAny, ## . tkAnyRune, ## _ tkIdentifier, ## abc tkStringLit, ## "abc" or 'abc' tkCharSet, ## [^A-Z] tkParLe, ## '(' tkParRi, ## ')' tkCurlyLe, ## '{' tkCurlyRi, ## '}' tkCurlyAt, ## '{@}' tkArrow, ## '<-' tkBar, ## '/' tkStar, ## '*' tkPlus, ## '+' tkAmp, ## '&' tkNot, ## '!' tkOption, ## '?' tkAt, ## '@' tkBuiltin, ## \identifier tkEscaped, ## \\ tkBackref, ## '$' tkDollar, ## '$' tkHat ## '^' Token {.final.} = object ## a token kind: TokKind ## the type of the token modifier: Modifier literal: string ## the parsed (string) literal charset: set[char] ## if kind == tkCharSet index: int ## if kind == tkBackref PegLexer {.inheritable.} = object ## the lexer object. bufpos: int ## the current position within the buffer buf: string ## the buffer itself lineNumber: int ## the current line number lineStart: int ## index of last line start in buffer colOffset: int ## column to add filename: string const tokKindToStr: array[TokKind, string] = [ "invalid", "[EOF]", ".", "_", "identifier", "string literal", "character set", "(", ")", "{", "}", "{@}", "<-", "/", "*", "+", "&", "!", "?", "@", "built-in", "escaped", "$", "$", "^" ] proc handleCR(L: var PegLexer, pos: int): int = assert(L.buf[pos] == '\c') inc(L.lineNumber) result = pos+1 if result < L.buf.len and L.buf[result] == '\L': inc(result) L.lineStart = result proc handleLF(L: var PegLexer, pos: int): int = assert(L.buf[pos] == '\L') inc(L.lineNumber) result = pos+1 L.lineStart = result proc init(L: var PegLexer, input, filename: string, line = 1, col = 0) = L.buf = input L.bufpos = 0 L.lineNumber = line L.colOffset = col L.lineStart = 0 L.filename = filename proc getColumn(L: PegLexer): int {.inline.} = result = abs(L.bufpos - L.lineStart) + L.colOffset proc getLine(L: PegLexer): int {.inline.} = result = L.lineNumber proc errorStr(L: PegLexer, msg: string, line = -1, col = -1): string = var line = if line < 0: getLine(L) else: line var col = if col < 0: getColumn(L) else: col result = "$1($2, $3) Error: $4" % [L.filename, $line, $col, msg] proc getEscapedChar(c: var PegLexer, tok: var Token) = inc(c.bufpos) if c.bufpos >= len(c.buf): tok.kind = tkInvalid return case c.buf[c.bufpos] of 'r', 'R', 'c', 'C': add(tok.literal, '\c') inc(c.bufpos) of 'l', 'L': add(tok.literal, '\L') inc(c.bufpos) of 'f', 'F': add(tok.literal, '\f') inc(c.bufpos) of 'e', 'E': add(tok.literal, '\e') inc(c.bufpos) of 'a', 'A': add(tok.literal, '\a') inc(c.bufpos) of 'b', 'B': add(tok.literal, '\b') inc(c.bufpos) of 'v', 'V': add(tok.literal, '\v') inc(c.bufpos) of 't', 'T': add(tok.literal, '\t') inc(c.bufpos) of 'x', 'X': inc(c.bufpos) if c.bufpos >= len(c.buf): tok.kind = tkInvalid return var xi = 0 if handleHexChar(c.buf[c.bufpos], xi): inc(c.bufpos) if handleHexChar(c.buf[c.bufpos], xi): inc(c.bufpos) if xi == 0: tok.kind = tkInvalid else: add(tok.literal, chr(xi)) of '0'..'9': var val = ord(c.buf[c.bufpos]) - ord('0') inc(c.bufpos) var i = 1 while (c.bufpos < len(c.buf)) and (i <= 3) and (c.buf[c.bufpos] in {'0'..'9'}): val = val * 10 + ord(c.buf[c.bufpos]) - ord('0') inc(c.bufpos) inc(i) if val > 0 and val <= 255: add(tok.literal, chr(val)) else: tok.kind = tkInvalid of '\0'..'\31': tok.kind = tkInvalid elif c.buf[c.bufpos] in strutils.Letters: tok.kind = tkInvalid else: add(tok.literal, c.buf[c.bufpos]) inc(c.bufpos) proc skip(c: var PegLexer) = var pos = c.bufpos while pos < c.buf.len: case c.buf[pos] of ' ', '\t': inc(pos) of '#': while (pos < c.buf.len) and not (c.buf[pos] in {'\c', '\L', '\0'}): inc(pos) of '\c': pos = handleCR(c, pos) of '\L': pos = handleLF(c, pos) else: break # EndOfFile also leaves the loop c.bufpos = pos proc getString(c: var PegLexer, tok: var Token) = tok.kind = tkStringLit var pos = c.bufpos + 1 var quote = c.buf[pos-1] while pos < c.buf.len: case c.buf[pos] of '\\': c.bufpos = pos getEscapedChar(c, tok) pos = c.bufpos of '\c', '\L', '\0': tok.kind = tkInvalid break elif c.buf[pos] == quote: inc(pos) break else: add(tok.literal, c.buf[pos]) inc(pos) c.bufpos = pos proc getDollar(c: var PegLexer, tok: var Token) = var pos = c.bufpos + 1 if pos < c.buf.len and c.buf[pos] in {'0'..'9'}: tok.kind = tkBackref tok.index = 0 while pos < c.buf.len and c.buf[pos] in {'0'..'9'}: tok.index = tok.index * 10 + ord(c.buf[pos]) - ord('0') inc(pos) else: tok.kind = tkDollar c.bufpos = pos proc getCharSet(c: var PegLexer, tok: var Token) = tok.kind = tkCharSet tok.charset = {} var pos = c.bufpos + 1 var caret = false if pos < c.buf.len: if c.buf[pos] == '^': inc(pos) caret = true while pos < c.buf.len: var ch: char case c.buf[pos] of ']': if pos < c.buf.len: inc(pos) break of '\\': c.bufpos = pos getEscapedChar(c, tok) pos = c.bufpos ch = tok.literal[tok.literal.len-1] of '\C', '\L', '\0': tok.kind = tkInvalid break else: ch = c.buf[pos] inc(pos) incl(tok.charset, ch) if c.buf[pos] == '-': if pos+1 < c.buf.len and c.buf[pos+1] == ']': incl(tok.charset, '-') inc(pos) else: if pos+1 < c.buf.len: inc(pos) else: break var ch2: char case c.buf[pos] of '\\': c.bufpos = pos getEscapedChar(c, tok) pos = c.bufpos ch2 = tok.literal[tok.literal.len-1] of '\C', '\L', '\0': tok.kind = tkInvalid break else: if pos+1 < c.buf.len: ch2 = c.buf[pos] inc(pos) else: break for i in ord(ch)+1 .. ord(ch2): incl(tok.charset, chr(i)) c.bufpos = pos if caret: tok.charset = {'\1'..'\xFF'} - tok.charset proc getSymbol(c: var PegLexer, tok: var Token) = var pos = c.bufpos while pos < c.buf.len: add(tok.literal, c.buf[pos]) inc(pos) if pos < c.buf.len and c.buf[pos] notin strutils.IdentChars: break c.bufpos = pos tok.kind = tkIdentifier proc getBuiltin(c: var PegLexer, tok: var Token) = if c.bufpos+1 < c.buf.len and c.buf[c.bufpos+1] in strutils.Letters: inc(c.bufpos) getSymbol(c, tok) tok.kind = tkBuiltin else: tok.kind = tkEscaped getEscapedChar(c, tok) # may set tok.kind to tkInvalid proc getTok(c: var PegLexer, tok: var Token) = tok.kind = tkInvalid tok.modifier = modNone setLen(tok.literal, 0) skip(c) if c.bufpos >= c.buf.len: tok.kind = tkEof tok.literal = "[EOF]" add(tok.literal, '\0') inc(c.bufpos) return case c.buf[c.bufpos] of '{': inc(c.bufpos) if c.buf[c.bufpos] == '@' and c.bufpos+2 < c.buf.len and c.buf[c.bufpos+1] == '}': tok.kind = tkCurlyAt inc(c.bufpos, 2) add(tok.literal, "{@}") else: tok.kind = tkCurlyLe add(tok.literal, '{') of '}': tok.kind = tkCurlyRi inc(c.bufpos) add(tok.literal, '}') of '[': getCharSet(c, tok) of '(': tok.kind = tkParLe inc(c.bufpos) add(tok.literal, '(') of ')': tok.kind = tkParRi inc(c.bufpos) add(tok.literal, ')') of '.': tok.kind = tkAny inc(c.bufpos) add(tok.literal, '.') of '_': tok.kind = tkAnyRune inc(c.bufpos) add(tok.literal, '_') of '\\': getBuiltin(c, tok) of '\'', '"': getString(c, tok) of '$': getDollar(c, tok) of 'a'..'z', 'A'..'Z', '\128'..'\255': getSymbol(c, tok) if c.bufpos >= c.buf.len: return if c.buf[c.bufpos] in {'\'', '"'} or c.buf[c.bufpos] == '$' and c.bufpos+1 < c.buf.len and c.buf[c.bufpos+1] in {'0'..'9'}: case tok.literal of "i": tok.modifier = modIgnoreCase of "y": tok.modifier = modIgnoreStyle of "v": tok.modifier = modVerbatim else: discard setLen(tok.literal, 0) if c.buf[c.bufpos] == '$': getDollar(c, tok) else: getString(c, tok) if tok.modifier == modNone: tok.kind = tkInvalid of '+': tok.kind = tkPlus inc(c.bufpos) add(tok.literal, '+') of '*': tok.kind = tkStar inc(c.bufpos) add(tok.literal, '+') of '<': if c.bufpos+2 < c.buf.len and c.buf[c.bufpos+1] == '-': inc(c.bufpos, 2) tok.kind = tkArrow add(tok.literal, "<-") else: add(tok.literal, '<') of '/': tok.kind = tkBar inc(c.bufpos) add(tok.literal, '/') of '?': tok.kind = tkOption inc(c.bufpos) add(tok.literal, '?') of '!': tok.kind = tkNot inc(c.bufpos) add(tok.literal, '!') of '&': tok.kind = tkAmp inc(c.bufpos) add(tok.literal, '!') of '@': tok.kind = tkAt inc(c.bufpos) add(tok.literal, '@') if c.buf[c.bufpos] == '@': tok.kind = tkCurlyAt inc(c.bufpos) add(tok.literal, '@') of '^': tok.kind = tkHat inc(c.bufpos) add(tok.literal, '^') else: if c.bufpos >= c.buf.len: tok.kind = tkEof tok.literal = "[EOF]" add(tok.literal, c.buf[c.bufpos]) inc(c.bufpos) proc arrowIsNextTok(c: PegLexer): bool = # the only look ahead we need var pos = c.bufpos while pos < c.buf.len and c.buf[pos] in {'\t', ' '}: inc(pos) if pos+1 >= c.buf.len: return result = c.buf[pos] == '<' and c.buf[pos+1] == '-' # ----------------------------- parser ---------------------------------------- type EInvalidPeg* = object of ValueError ## raised if an invalid ## PEG has been detected PegParser = object of PegLexer ## the PEG parser object tok: Token nonterms: seq[NonTerminal] modifier: Modifier captures: int identIsVerbatim: bool skip: Peg proc pegError(p: PegParser, msg: string, line = -1, col = -1) = var e: ref EInvalidPeg new(e) e.msg = errorStr(p, msg, line, col) raise e proc getTok(p: var PegParser) = getTok(p, p.tok) if p.tok.kind == tkInvalid: pegError(p, "'" & p.tok.literal & "' is invalid token") proc eat(p: var PegParser, kind: TokKind) = if p.tok.kind == kind: getTok(p) else: pegError(p, tokKindToStr[kind] & " expected") proc parseExpr(p: var PegParser): Peg {.gcsafe.} proc getNonTerminal(p: var PegParser, name: string): NonTerminal = for i in 0..high(p.nonterms): result = p.nonterms[i] if cmpIgnoreStyle(result.name, name) == 0: return # forward reference: result = newNonTerminal(name, getLine(p), getColumn(p)) add(p.nonterms, result) proc modifiedTerm(s: string, m: Modifier): Peg = case m of modNone, modVerbatim: result = term(s) of modIgnoreCase: result = termIgnoreCase(s) of modIgnoreStyle: result = termIgnoreStyle(s) proc modifiedBackref(s: int, m: Modifier): Peg = case m of modNone, modVerbatim: result = backref(s) of modIgnoreCase: result = backrefIgnoreCase(s) of modIgnoreStyle: result = backrefIgnoreStyle(s) proc builtin(p: var PegParser): Peg = # do not use "y", "skip" or "i" as these would be ambiguous case p.tok.literal of "n": result = newLine() of "d": result = charSet({'0'..'9'}) of "D": result = charSet({'\1'..'\xff'} - {'0'..'9'}) of "s": result = charSet({' ', '\9'..'\13'}) of "S": result = charSet({'\1'..'\xff'} - {' ', '\9'..'\13'}) of "w": result = charSet({'a'..'z', 'A'..'Z', '_', '0'..'9'}) of "W": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z', '_', '0'..'9'}) of "a": result = charSet({'a'..'z', 'A'..'Z'}) of "A": result = charSet({'\1'..'\xff'} - {'a'..'z', 'A'..'Z'}) of "ident": result = pegs.ident of "letter": result = unicodeLetter() of "upper": result = unicodeUpper() of "lower": result = unicodeLower() of "title": result = unicodeTitle() of "white": result = unicodeWhitespace() else: pegError(p, "unknown built-in: " & p.tok.literal) proc token(terminal: Peg, p: PegParser): Peg = if p.skip.kind == pkEmpty: result = terminal else: result = sequence(p.skip, terminal) proc primary(p: var PegParser): Peg = case p.tok.kind of tkAmp: getTok(p) return &primary(p) of tkNot: getTok(p) return !primary(p) of tkAt: getTok(p) return !*primary(p) of tkCurlyAt: getTok(p) return !*\primary(p).token(p) else: discard case p.tok.kind of tkIdentifier: if p.identIsVerbatim: var m = p.tok.modifier if m == modNone: m = p.modifier result = modifiedTerm(p.tok.literal, m).token(p) getTok(p) elif not arrowIsNextTok(p): var nt = getNonTerminal(p, p.tok.literal) incl(nt.flags, ntUsed) result = nonterminal(nt).token(p) getTok(p) else: pegError(p, "expression expected, but found: " & p.tok.literal) of tkStringLit: var m = p.tok.modifier if m == modNone: m = p.modifier result = modifiedTerm(p.tok.literal, m).token(p) getTok(p) of tkCharSet: if '\0' in p.tok.charset: pegError(p, "binary zero ('\\0') not allowed in character class") result = charSet(p.tok.charset).token(p) getTok(p) of tkParLe: getTok(p) result = parseExpr(p) eat(p, tkParRi) of tkCurlyLe: getTok(p) result = capture(parseExpr(p)).token(p) eat(p, tkCurlyRi) inc(p.captures) of tkAny: result = any().token(p) getTok(p) of tkAnyRune: result = anyRune().token(p) getTok(p) of tkBuiltin: result = builtin(p).token(p) getTok(p) of tkEscaped: result = term(p.tok.literal[0]).token(p) getTok(p) of tkDollar: result = endAnchor() getTok(p) of tkHat: result = startAnchor() getTok(p) of tkBackref: var m = p.tok.modifier if m == modNone: m = p.modifier result = modifiedBackref(p.tok.index, m).token(p) if p.tok.index < 0 or p.tok.index > p.captures: pegError(p, "invalid back reference index: " & $p.tok.index) getTok(p) else: pegError(p, "expression expected, but found: " & p.tok.literal) getTok(p) # we must consume a token here to prevent endless loops! while true: case p.tok.kind of tkOption: result = ?result getTok(p) of tkStar: result = *result getTok(p) of tkPlus: result = +result getTok(p) else: break proc seqExpr(p: var PegParser): Peg = result = primary(p) while true: case p.tok.kind of tkAmp, tkNot, tkAt, tkStringLit, tkCharSet, tkParLe, tkCurlyLe, tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkBackref, tkHat, tkCurlyAt: result = sequence(result, primary(p)) of tkIdentifier: if not arrowIsNextTok(p): result = sequence(result, primary(p)) else: break else: break proc parseExpr(p: var PegParser): Peg = result = seqExpr(p) while p.tok.kind == tkBar: getTok(p) result = result / seqExpr(p) proc parseRule(p: var PegParser): NonTerminal = if p.tok.kind == tkIdentifier and arrowIsNextTok(p): result = getNonTerminal(p, p.tok.literal) if ntDeclared in result.flags: pegError(p, "attempt to redefine: " & result.name) result.line = getLine(p) result.col = getColumn(p) getTok(p) eat(p, tkArrow) result.rule = parseExpr(p) incl(result.flags, ntDeclared) # NOW inlining may be attempted else: pegError(p, "rule expected, but found: " & p.tok.literal) proc rawParse(p: var PegParser): Peg = ## parses a rule or a PEG expression while p.tok.kind == tkBuiltin: case p.tok.literal of "i": p.modifier = modIgnoreCase getTok(p) of "y": p.modifier = modIgnoreStyle getTok(p) of "skip": getTok(p) p.skip = ?primary(p) else: break if p.tok.kind == tkIdentifier and arrowIsNextTok(p): result = parseRule(p).rule while p.tok.kind != tkEof: discard parseRule(p) else: p.identIsVerbatim = true result = parseExpr(p) if p.tok.kind != tkEof: pegError(p, "EOF expected, but found: " & p.tok.literal) for i in 0..high(p.nonterms): var nt = p.nonterms[i] if ntDeclared notin nt.flags: pegError(p, "undeclared identifier: " & nt.name, nt.line, nt.col) elif ntUsed notin nt.flags and i > 0: pegError(p, "unused rule: " & nt.name, nt.line, nt.col) proc parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): Peg = ## constructs a Peg object from `pattern`. `filename`, `line`, `col` are ## used for error messages, but they only provide start offsets. `parsePeg` ## keeps track of line and column numbers within `pattern`. var p: PegParser init(PegLexer(p), pattern, filename, line, col) p.tok.kind = tkInvalid p.tok.modifier = modNone p.tok.literal = "" p.tok.charset = {} p.nonterms = @[] p.identIsVerbatim = false getTok(p) result = rawParse(p) proc peg*(pattern: string): Peg = ## constructs a Peg object from the `pattern`. The short name has been ## chosen to encourage its use as a raw string modifier:: ## ## peg"{\ident} \s* '=' \s* {.*}" result = parsePeg(pattern, "pattern") proc escapePeg*(s: string): string = ## escapes `s` so that it is matched verbatim when used as a peg. result = "" var inQuote = false for c in items(s): case c of '\0'..'\31', '\'', '"', '\\': if inQuote: result.add('\'') inQuote = false result.add("\\x") result.add(toHex(ord(c), 2)) else: if not inQuote: result.add('\'') inQuote = true result.add(c) if inQuote: result.add('\'') when isMainModule: proc pegsTest() = assert escapePeg("abc''def'") == r"'abc'\x27\x27'def'\x27" assert match("(a b c)", peg"'(' @ ')'") assert match("W_HI_Le", peg"\y 'while'") assert(not match("W_HI_L", peg"\y 'while'")) assert(not match("W_HI_Le", peg"\y v'while'")) assert match("W_HI_Le", peg"y'while'") assert($ +digits == $peg"\d+") assert "0158787".match(peg"\d+") assert "ABC 0232".match(peg"\w+\s+\d+") assert "ABC".match(peg"\d+ / \w+") var accum: seq[string] = @[] for word in split("00232this02939is39an22example111", peg"\d+"): accum.add(word) assert(accum == @["this", "is", "an", "example"]) assert matchLen("key", ident) == 3 var pattern = sequence(ident, *whitespace, term('='), *whitespace, ident) assert matchLen("key1= cal9", pattern) == 11 var ws = newNonTerminal("ws", 1, 1) ws.rule = *whitespace var expr = newNonTerminal("expr", 1, 1) expr.rule = sequence(capture(ident), *sequence( nonterminal(ws), term('+'), nonterminal(ws), nonterminal(expr))) var c: Captures var s = "a+b + c +d+e+f" assert rawMatch(s, expr.rule, 0, c) == len(s) var a = "" for i in 0..c.ml-1: a.add(substr(s, c.matches[i][0], c.matches[i][1])) assert a == "abcdef" #echo expr.rule #const filename = "lib/devel/peg/grammar.txt" #var grammar = parsePeg(newFileStream(filename, fmRead), filename) #echo "a <- [abc]*?".match(grammar) assert find("_____abc_______", term("abc"), 2) == 5 assert match("_______ana", peg"A <- 'ana' / . A") assert match("abcs%%%", peg"A <- ..A / .A / '%'") var matches: array[0..MaxSubpatterns-1, string] if "abc" =~ peg"{'a'}'bc' 'xyz' / {\ident}": assert matches[0] == "abc" else: assert false var g2 = peg"""S <- A B / C D A <- 'a'+ B <- 'b'+ C <- 'c'+ D <- 'd'+ """ assert($g2 == "((A B) / (C D))") assert match("cccccdddddd", g2) assert("var1=key; var2=key2".replacef(peg"{\ident}'='{\ident}", "$1<-$2$2") == "var1<-keykey; var2<-key2key2") assert("var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2") == "$1<-$2$2; $1<-$2$2") assert "var1=key; var2=key2".endsWith(peg"{\ident}'='{\ident}") if "aaaaaa" =~ peg"'aa' !. / ({'a'})+": assert matches[0] == "a" else: assert false if match("abcdefg", peg"c {d} ef {g}", matches, 2): assert matches[0] == "d" assert matches[1] == "g" else: assert false accum = @[] for x in findAll("abcdef", peg".", 3): accum.add(x) assert(accum == @["d", "e", "f"]) for x in findAll("abcdef", peg"^{.}", 3): assert x == "d" if "f(a, b)" =~ peg"{[0-9]+} / ({\ident} '(' {@} ')')": assert matches[0] == "f" assert matches[1] == "a, b" else: assert false assert match("eine übersicht und außerdem", peg"(\letter \white*)+") # ß is not a lower cased letter?! assert match("eine übersicht und auerdem", peg"(\lower \white*)+") assert match("EINE ÜBERSICHT UND AUSSERDEM", peg"(\upper \white*)+") assert(not match("456678", peg"(\letter)+")) assert("var1 = key; var2 = key2".replacef( peg"\skip(\s*) {\ident}'='{\ident}", "$1<-$2$2") == "var1<-keykey;var2<-key2key2") assert match("prefix/start", peg"^start$", 7) if "foo" =~ peg"{'a'}?.*": assert matches[0].len == 0 else: assert false if "foo" =~ peg"{''}.*": assert matches[0] == "" else: assert false if "foo" =~ peg"{'foo'}": assert matches[0] == "foo" else: assert false let empty_test = peg"^\d*" let str = "XYZ" assert(str.find(empty_test) == 0) assert(str.match(empty_test)) proc handleMatches(m: int, n: int, c: openArray[string]): string = result = "" if m > 0: result.add ", " result.add case n: of 2: toLowerAscii(c[0]) & ": '" & c[1] & "'" of 1: toLowerAscii(c[0]) & ": ''" else: "" assert("Var1=key1;var2=Key2; VAR3". replace(peg"{\ident}('='{\ident})* ';'* \s*", handleMatches) == "var1: 'key1', var2: 'Key2', var3: ''") doAssert "test1".match(peg"""{@}$""") doAssert "test2".match(peg"""{(!$ .)*} $""") pegsTest() static: pegsTest()