#
#
# Nimrod's Runtime Library
# (c) Copyright 2010 Andreas Rumpf
#
# See the file "copying.txt", included in this
# distribution, for details about the copyright.
#
## Simple PEG (Parsing expression grammar) matching. Uses no memorization, but
## uses superoperators and symbol inlining to improve performance. Note:
## Matching performance is hopefully competitive with optimized regular
## expression engines.
##
## .. include:: ../doc/pegdocs.txt
##
include "system/inclrtl"
const
useUnicode = true ## change this to deactivate proper UTF-8 support
import
strutils
when useUnicode:
import unicode
const
InlineThreshold = 5 ## number of leaves; -1 to disable inlining
MaxSubpatterns* = 10 ## defines the maximum number of subpatterns that
## can be captured. More subpatterns cannot be captured!
type
TPegKind = enum
pkEmpty,
pkAny, ## any character (.)
pkAnyRune, ## any Unicode character (_)
pkNewLine, ## CR-LF, LF, CR
pkLetter, ## Unicode letter
pkLower, ## Unicode lower case letter
pkUpper, ## Unicode upper case letter
pkTitle, ## Unicode title character
pkWhitespace, ## Unicode whitespace character
pkTerminal,
pkTerminalIgnoreCase,
pkTerminalIgnoreStyle,
pkChar, ## single character to match
pkCharChoice,
pkNonTerminal,
pkSequence, ## a b c ... --> Internal DSL: peg(a, b, c)
pkOrderedChoice, ## a / b / ... --> Internal DSL: a / b or /[a, b, c]
pkGreedyRep, ## a* --> Internal DSL: *a
## a+ --> (a a*)
pkGreedyRepChar, ## x* where x is a single character (superop)
pkGreedyRepSet, ## [set]* (superop)
pkGreedyAny, ## .* or _* (superop)
pkOption, ## a? --> Internal DSL: ?a
pkAndPredicate, ## &a --> Internal DSL: &a
pkNotPredicate, ## !a --> Internal DSL: !a
pkCapture, ## {a} --> Internal DSL: capture(a)
pkBackRef, ## $i --> Internal DSL: backref(i)
pkBackRefIgnoreCase,
pkBackRefIgnoreStyle,
pkSearch, ## @a --> Internal DSL: @a
pkCapturedSearch, ## {@} a --> Internal DSL: @@a
pkRule, ## a <- b
pkList ## a, b
TNonTerminalFlag = enum
ntDeclared, ntUsed
TNonTerminal {.final.} = object ## represents a non terminal symbol
name: string ## the name of the symbol
line: int ## line the symbol has been declared/used in
col: int ## column the symbol has been declared/used in
flags: set[TNonTerminalFlag] ## the nonterminal's flags
rule: TNode ## the rule that the symbol refers to
TNode {.final.} = object
case kind: TPegKind
of pkEmpty..pkWhitespace: nil
of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle: term: string
of pkChar, pkGreedyRepChar: ch: char
of pkCharChoice, pkGreedyRepSet: charChoice: ref set[char]
of pkNonTerminal: nt: PNonTerminal
of pkBackRef..pkBackRefIgnoreStyle: index: range[1..MaxSubpatterns]
else: sons: seq[TNode]
PNonTerminal* = ref TNonTerminal
TPeg* = TNode ## type that represents a PEG
proc term*(t: string): TPeg {.nosideEffect, rtl, extern: "npegs$1Str".} =
## constructs a PEG from a terminal string
if t.len != 1:
result.kind = pkTerminal
result.term = t
else:
result.kind = pkChar
result.ch = t[0]
proc termIgnoreCase*(t: string): TPeg {.
nosideEffect, rtl, extern: "npegs$1".} =
## constructs a PEG from a terminal string; ignore case for matching
result.kind = pkTerminalIgnoreCase
result.term = t
proc termIgnoreStyle*(t: string): TPeg {.
nosideEffect, rtl, extern: "npegs$1".} =
## constructs a PEG from a terminal string; ignore style for matching
result.kind = pkTerminalIgnoreStyle
result.term = t
proc term*(t: char): TPeg {.nosideEffect, rtl, extern: "npegs$1Char".} =
## constructs a PEG from a terminal char
assert t != '\0'
result.kind = pkChar
result.ch = t
proc charSet*(s: set[char]): TPeg {.nosideEffect, rtl, extern: "npegs$1".} =
## constructs a PEG from a character set `s`
assert '\0' notin s
result.kind = pkCharChoice
new(result.charChoice)
result.charChoice^ = s
proc len(a: TPeg): int {.inline.} = return a.sons.len
proc add(d: var TPeg, s: TPeg) {.inline.} = add(d.sons, s)
proc addChoice(dest: var TPeg, elem: TPeg) =
var L = dest.len-1
if L >= 0 and dest.sons[L].kind == pkCharChoice:
case elem.kind
of pkCharChoice:
dest.sons[L].charChoice^ = dest.sons[L].charChoice^ + elem.charChoice^
of pkChar: incl(dest.sons[L].charChoice^, elem.ch)
else: add(dest, elem)
else: add(dest, elem)
template multipleOp(k: TPegKind, localOpt: expr) =
result.kind = k
result.sons = @[]
for x in items(a):
if x.kind == k:
for y in items(x.sons):
localOpt(result, y)
else:
localOpt(result, x)
if result.len == 1:
result = result.sons[0]
proc `/`*(a: openArray[TPeg]): TPeg {.
nosideEffect, rtl, extern: "npegsOrderedChoice".} =
## constructs an ordered choice with the PEGs in `a`
multipleOp(pkOrderedChoice, addChoice)
proc addSequence(dest: var TPeg, elem: TPeg) =
var L = dest.len-1
if L >= 0 and dest.sons[L].kind == pkTerminal:
case elem.kind
of pkTerminal: add(dest.sons[L].term, elem.term)
of pkChar: add(dest.sons[L].term, elem.ch)
else: add(dest, elem)
else: add(dest, elem)
proc sequence*(a: openArray[TPeg]): TPeg {.
nosideEffect, rtl, extern: "npegs$1".} =
## constructs a sequence with all the PEGs from `a`
multipleOp(pkSequence, addSequence)
proc `?`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsOptional".} =
## constructs an optional for the PEG `a`
if a.kind in {pkOption, pkGreedyRep, pkGreedyAny, pkGreedyRepChar,
pkGreedyRepSet}:
# a* ? --> a*
# a? ? --> a?
result = a
else:
result.kind = pkOption
result.sons = @[a]
proc `*`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsGreedyRep".} =
## constructs a "greedy repetition" for the PEG `a`
case a.kind
of pkGreedyRep, pkGreedyRepChar, pkGreedyRepSet, pkGreedyAny, pkOption:
assert false
# produces endless loop!
of pkChar:
result.kind = pkGreedyRepChar
result.ch = a.ch
of pkCharChoice:
result.kind = pkGreedyRepSet
result.charChoice = a.charChoice # copying a reference suffices!
of pkAny, pkAnyRune:
result.kind = pkGreedyAny
else:
result.kind = pkGreedyRep
result.sons = @[a]
proc `@`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsSearch".} =
## constructs a "search" for the PEG `a`
result.kind = pkSearch
result.sons = @[a]
proc `@@`*(a: TPeg): TPeg {.noSideEffect, rtl,
extern: "npgegsCapturedSearch".} =
## constructs a "captured search" for the PEG `a`
result.kind = pkCapturedSearch
result.sons = @[a]
when false:
proc contains(a: TPeg, k: TPegKind): bool =
if a.kind == k: return true
case a.kind
of pkEmpty, pkAny, pkAnyRune, pkGreedyAny, pkNewLine, pkTerminal,
pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar, pkGreedyRepChar,
pkCharChoice, pkGreedyRepSet: nil
of pkNonTerminal: return true
else:
for i in 0..a.sons.len-1:
if contains(a.sons[i], k): return true
proc `+`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsGreedyPosRep".} =
## constructs a "greedy positive repetition" with the PEG `a`
return sequence(a, *a)
proc `&`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsAndPredicate".} =
## constructs an "and predicate" with the PEG `a`
result.kind = pkAndPredicate
result.sons = @[a]
proc `!`*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsNotPredicate".} =
## constructs a "not predicate" with the PEG `a`
result.kind = pkNotPredicate
result.sons = @[a]
proc any*: TPeg {.inline.} =
## constructs the PEG `any character`:idx: (``.``)
result.kind = pkAny
proc anyRune*: TPeg {.inline.} =
## constructs the PEG `any rune`:idx: (``_``)
result.kind = pkAnyRune
proc newLine*: TPeg {.inline.} =
## constructs the PEG `newline`:idx: (``\n``)
result.kind = pkNewline
proc UnicodeLetter*: TPeg {.inline.} =
## constructs the PEG ``\letter`` which matches any Unicode letter.
result.kind = pkLetter
proc UnicodeLower*: TPeg {.inline.} =
## constructs the PEG ``\lower`` which matches any Unicode lowercase letter.
result.kind = pkLower
proc UnicodeUpper*: TPeg {.inline.} =
## constructs the PEG ``\upper`` which matches any Unicode lowercase letter.
result.kind = pkUpper
proc UnicodeTitle*: TPeg {.inline.} =
## constructs the PEG ``\title`` which matches any Unicode title letter.
result.kind = pkTitle
proc UnicodeWhitespace*: TPeg {.inline.} =
## constructs the PEG ``\white`` which matches any Unicode
## whitespace character.
result.kind = pkWhitespace
proc capture*(a: TPeg): TPeg {.nosideEffect, rtl, extern: "npegsCapture".} =
## constructs a capture with the PEG `a`
result.kind = pkCapture
result.sons = @[a]
proc backref*(index: range[1..MaxSubPatterns]): TPeg {.
nosideEffect, rtl, extern: "npegs$1".} =
## constructs a back reference of the given `index`. `index` starts counting
## from 1.
result.kind = pkBackRef
result.index = index-1
proc backrefIgnoreCase*(index: range[1..MaxSubPatterns]): TPeg {.
nosideEffect, rtl, extern: "npegs$1".} =
## constructs a back reference of the given `index`. `index` starts counting
## from 1. Ignores case for matching.
result.kind = pkBackRefIgnoreCase
result.index = index-1
proc backrefIgnoreStyle*(index: range[1..MaxSubPatterns]): TPeg {.
nosideEffect, rtl, extern: "npegs$1".}=
## constructs a back reference of the given `index`. `index` starts counting
## from 1. Ignores style for matching.
result.kind = pkBackRefIgnoreStyle
result.index = index-1
proc spaceCost(n: TPeg): int =
case n.kind
of pkEmpty: nil
of pkTerminal, pkTerminalIgnoreCase, pkTerminalIgnoreStyle, pkChar,
pkGreedyRepChar, pkCharChoice, pkGreedyRepSet,
pkAny..pkWhitespace, pkGreedyAny:
result = 1
of pkNonTerminal:
# we cannot inline a rule with a non-terminal
result = InlineThreshold+1
else:
for i in 0..n.len-1:
inc(result, spaceCost(n.sons[i]))
if result >= InlineThreshold: break
proc nonterminal*(n: PNonTerminal): TPeg {.
nosideEffect, rtl, extern: "npegs$1".} =
## constructs a PEG that consists of the nonterminal symbol
assert n != nil
if ntDeclared in n.flags and spaceCost(n.rule) < InlineThreshold:
when false: echo "inlining symbol: ", n.name
result = n.rule # inlining of rule enables better optimizations
else:
result.kind = pkNonTerminal
result.nt = n
proc newNonTerminal*(name: string, line, column: int): PNonTerminal {.
nosideEffect, rtl, extern: "npegs$1".} =
## constructs a nonterminal symbol
new(result)
result.name = name
result.line = line
result.col = column
template letters*: expr =
## expands to ``charset({'A'..'Z', 'a'..'z'})``
charset({'A'..'Z', 'a'..'z'})
template digits*: expr =
## expands to ``charset({'0'..'9'})``
charset({'0'..'9'})
template whitespace*: expr =
## expands to ``charset({' ', '\9'..'\13'})``
charset({' ', '\9'..'\13'})
template identChars*: expr =
## expands to ``charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})``
charset({'a'..'z', 'A'..'Z', '0'..'9', '_'})
template identStartChars*: expr =
## expands to ``charset({'A'..'Z', 'a'..'z', '_'})``
charset({'a'..'z', 'A'..'Z', '_'})
template ident*: expr =
## same as ``[a-zA-Z_][a-zA-z_0-9]*``; standard identifier
sequence(charset({'a'..'z', 'A'..'Z', '_'}),
*charset({'a'..'z', 'A'..'Z', '0'..'9', '_'}))
template natural*: expr =
## same as ``\d+``
+digits
# ------------------------- debugging -----------------------------------------
proc esc(c: char, reserved = {'\0'..'\255'}): string =
case c
of '\b': result = "\\b"
of '\t': result = "\\t"
of '\c': result = "\\c"
of '\L': result = "\\l"
of '\v': result = "\\v"
of '\f': result = "\\f"
of '\e': result = "\\e"
of '\a': result = "\\a"
of '\\': result = "\\\\"
of 'a'..'z', 'A'..'Z', '0'..'9', '_': result = $c
elif c < ' ' or c >= '\128': result = '\\' & $ord(c)
elif c in reserved: result = '\\' & c
else: result = $c
proc singleQuoteEsc(c: Char): string = return "'" & esc(c, {'\''}) & "'"
proc singleQuoteEsc(str: string): string =
result = "'"
for c in items(str): add result, esc(c, {'\''})
add result, '\''
proc charSetEscAux(cc: set[char]): string =
const reserved = {'^', '-', ']'}
result = ""
var c1 = 0
while c1 <= 0xff:
if chr(c1) in cc:
var c2 = c1
while c2 < 0xff and chr(succ(c2)) in cc: inc(c2)
if c1 == c2:
add result, esc(chr(c1), reserved)
elif c2 == succ(c1):
add result, esc(chr(c1), reserved) & esc(chr(c2), reserved)
else:
add result, esc(chr(c1), reserved) & '-' & esc(chr(c2), reserved)
c1 = c2
inc(c1)
proc CharSetEsc(cc: set[char]): string =
if card(cc) >= 128+64:
result = "[^" & CharSetEscAux({'\1'..'\xFF'} - cc) & ']'
else:
result = '[' & CharSetEscAux(cc) & ']'
proc toStrAux(r: TPeg, res: var string) =
case r.kind
of pkEmpty: add(res, "()")
of pkAny: add(res, '.')
of pkAnyRune: add(res, '_')
of pkLetter: add(res, "\\letter")
of pkLower: add(res, "\\lower")
of pkUpper: add(res, "\\upper")
of pkTitle: add(res, "\\title")
of pkWhitespace: add(res, "\\white")
of pkNewline: add(res, "\\n")
of pkTerminal: add(res, singleQuoteEsc(r.term))
of pkTerminalIgnoreCase:
add(res, 'i')
add(res, singleQuoteEsc(r.term))
of pkTerminalIgnoreStyle:
add(res, 'y')
add(res, singleQuoteEsc(r.term))
of pkChar: add(res, singleQuoteEsc(r.ch))
of pkCharChoice: add(res, charSetEsc(r.charChoice^))
of pkNonTerminal: add(res, r.nt.name)
of pkSequence:
add(res, '(')
toStrAux(r.sons[0], res)
for i in 1 .. high(r.sons):
add(res, ' ')
toStrAux(r.sons[i], res)
add(res, ')')
of pkOrderedChoice:
add(res, '(')
toStrAux(r.sons[0], res)
for i in 1 .. high(r.sons):
add(res, " / ")
toStrAux(r.sons[i], res)
add(res, ')')
of pkGreedyRep:
toStrAux(r.sons[0], res)
add(res, '*')
of pkGreedyRepChar:
add(res, singleQuoteEsc(r.ch))
add(res, '*')
of pkGreedyRepSet:
add(res, charSetEsc(r.charChoice^))
add(res, '*')
of pkGreedyAny:
add(res, ".*")
of pkOption:
toStrAux(r.sons[0], res)
add(res, '?')
of pkAndPredicate:
add(res, '&')
toStrAux(r.sons[0], res)
of pkNotPredicate:
add(res, '!')
toStrAux(r.sons[0], res)
of pkSearch:
add(res, '@')
toStrAux(r.sons[0], res)
of pkCapturedSearch:
add(res, "{@}")
toStrAux(r.sons[0], res)
of pkCapture:
add(res, '{')
toStrAux(r.sons[0], res)
add(res, '}')
of pkBackRef:
add(res, '$')
add(res, $r.index)
of pkBackRefIgnoreCase:
add(res, "i$")
add(res, $r.index)
of pkBackRefIgnoreStyle:
add(res, "y$")
add(res, $r.index)
of pkRule:
toStrAux(r.sons[0], res)
add(res, " <- ")
toStrAux(r.sons[1], res)
of pkList:
for i in 0 .. high(r.sons):
toStrAux(r.sons[i], res)
add(res, "\n")
proc `$` *(r: TPeg): string {.nosideEffect, rtl, extern: "npegsToString".} =
## converts a PEG to its string representation
result = ""
toStrAux(r, result)
# --------------------- core engine -------------------------------------------
type
TCaptures* {.final.} = object ## contains the captured substrings.
matches: array[0..maxSubpatterns-1, tuple[first, last: int]]
ml: int
proc bounds*(c: TCaptures,
i: range[0..maxSubpatterns-1]): tuple[first, last: int] =
## returns the bounds ``[first..last]`` of the `i`'th capture.
result = c.matches[i]
when not useUnicode:
type
TRune = char
template fastRuneAt(s, i, ch: expr) =
ch = s[i]
inc(i)
template runeLenAt(s, i: expr): expr = 1
proc isAlpha(a: char): bool {.inline.} = return a in {'a'..'z','A'..'Z'}
proc isUpper(a: char): bool {.inline.} = return a in {'A'..'Z'}
proc isLower(a: char): bool {.inline.} = return a in {'a'..'z'}
proc isTitle(a: char): bool {.inline.} = return false
proc isWhiteSpace(a: char): bool {.inline.} = return a in {' ', '\9'..'\13'}
proc rawMatch*(s: string, p: TPeg, start: int, c: var TCaptures): int {.
nosideEffect, rtl, extern: "npegs$1".} =
## low-level matching proc that implements the PEG interpreter. Use this
## for maximum efficiency (every other PEG operation ends up calling this
## proc).
## Returns -1 if it does not match, else the length of the match
case p.kind
of pkEmpty: result = 0 # match of length 0
of pkAny:
if s[start] != '\0': result = 1
else: result = -1
of pkAnyRune:
if s[start] != '\0':
result = runeLenAt(s, start)
else:
result = -1
of pkLetter:
if s[start] != '\0':
var a: TRune
result = start
fastRuneAt(s, result, a)
if isAlpha(a): dec(result, start)
else: result = -1
else:
result = -1
of pkLower:
if s[start] != '\0':
var a: TRune
result = start
fastRuneAt(s, result, a)
if isLower(a): dec(result, start)
else: result = -1
else:
result = -1
of pkUpper:
if s[start] != '\0':
var a: TRune
result = start
fastRuneAt(s, result, a)
if isUpper(a): dec(result, start)
else: result = -1
else:
result = -1
of pkTitle:
if s[start] != '\0':
var a: TRune
result = start
fastRuneAt(s, result, a)
if isTitle(a): dec(result, start)
else: result = -1
else:
result = -1
of pkWhitespace:
if s[start] != '\0':
var a: TRune
result = start
fastRuneAt(s, result, a)
if isWhitespace(a): dec(result, start)
else: result = -1
else:
result = -1
of pkGreedyAny:
result = len(s) - start
of pkNewLine:
if s[start] == '\L': result = 1
elif s[start] == '\C':
if s[start+1] == '\L': result = 2
else: result = 1
else: result = -1
of pkTerminal:
result = len(p.term)
for i in 0..result-1:
if p.term[i] != s[start+i]:
result = -1
break
of pkTerminalIgnoreCase:
var
i = 0
a, b: TRune
result = start
while i < len(p.term):
fastRuneAt(p.term, i, a)
fastRuneAt(s, result, b)
if toLower(a) != toLower(b):
result = -1
break
dec(result, start)
of pkTerminalIgnoreStyle:
var
i = 0
a, b: TRune
result = start
while i < len(p.term):
while true:
fastRuneAt(p.term, i, a)
if a != TRune('_'): break
while true:
fastRuneAt(s, result, b)
if b != TRune('_'): break
if toLower(a) != toLower(b):
result = -1
break
dec(result, start)
of pkChar:
if p.ch == s[start]: result = 1
else: result = -1
of pkCharChoice:
if contains(p.charChoice^, s[start]): result = 1
else: result = -1
of pkNonTerminal:
var oldMl = c.ml
when false: echo "enter: ", p.nt.name
result = rawMatch(s, p.nt.rule, start, c)
when false: echo "leave: ", p.nt.name
if result < 0: c.ml = oldMl
of pkSequence:
var oldMl = c.ml
result = 0
for i in 0..high(p.sons):
var x = rawMatch(s, p.sons[i], start+result, c)
if x < 0:
c.ml = oldMl
result = -1
break
else: inc(result, x)
of pkOrderedChoice:
var oldMl = c.ml
for i in 0..high(p.sons):
result = rawMatch(s, p.sons[i], start, c)
if result >= 0: break
c.ml = oldMl
of pkSearch:
var oldMl = c.ml
result = 0
while start+result < s.len:
var x = rawMatch(s, p.sons[0], start+result, c)
if x >= 0:
inc(result, x)
return
inc(result)
result = -1
c.ml = oldMl
of pkCapturedSearch:
var idx = c.ml # reserve a slot for the subpattern
inc(c.ml)
result = 0
while start+result < s.len:
var x = rawMatch(s, p.sons[0], start+result, c)
if x >= 0:
if idx < maxSubpatterns:
c.matches[idx] = (start, start+result-1)
#else: silently ignore the capture
inc(result, x)
return
inc(result)
result = -1
c.ml = idx
of pkGreedyRep:
result = 0
while true:
var x = rawMatch(s, p.sons[0], start+result, c)
# if x == 0, we have an endless loop; so the correct behaviour would be
# not to break. But endless loops can be easily introduced:
# ``(comment / \w*)*`` is such an example. Breaking for x == 0 does the
# expected thing in this case.
if x <= 0: break
inc(result, x)
of pkGreedyRepChar:
result = 0
var ch = p.ch
while ch == s[start+result]: inc(result)
of pkGreedyRepSet:
result = 0
while contains(p.charChoice^, s[start+result]): inc(result)
of pkOption:
result = max(0, rawMatch(s, p.sons[0], start, c))
of pkAndPredicate:
var oldMl = c.ml
result = rawMatch(s, p.sons[0], start, c)
if result >= 0: result = 0 # do not consume anything
else: c.ml = oldMl
of pkNotPredicate:
var oldMl = c.ml
result = rawMatch(s, p.sons[0], start, c)
if result < 0: result = 0
else:
c.ml = oldMl
result = -1
of pkCapture:
var idx = c.ml # reserve a slot for the subpattern
inc(c.ml)
result = rawMatch(s, p.sons[0], start, c)
if result >= 0:
if idx < maxSubpatterns:
c.matches[idx] = (start, start+result-1)
#else: silently ignore the capture
else:
c.ml = idx
of pkBackRef..pkBackRefIgnoreStyle:
if p.index >= c.ml: return -1
var (a, b) = c.matches[p.index]
var n: TPeg
n.kind = succ(pkTerminal, ord(p.kind)-ord(pkBackRef))
n.term = s.copy(a, b)
result = rawMatch(s, n, start, c)
of pkRule, pkList: assert false
proc match*(s: string, pattern: TPeg, matches: var openarray[string],
start = 0): bool {.nosideEffect, rtl, extern: "npegs$1Capture".} =
## returns ``true`` if ``s[start..]`` matches the ``pattern`` and
## the captured substrings in the array ``matches``. If it does not
## match, nothing is written into ``matches`` and ``false`` is
## returned.
var c: TCaptures
result = rawMatch(s, pattern, start, c) == len(s) -start
if result:
for i in 0..c.ml-1:
matches[i] = copy(s, c.matches[i][0], c.matches[i][1])
proc match*(s: string, pattern: TPeg,
start = 0): bool {.nosideEffect, rtl, extern: "npegs$1".} =
## returns ``true`` if ``s`` matches the ``pattern`` beginning from ``start``.
var c: TCaptures
result = rawMatch(s, pattern, start, c) == len(s)-start
proc matchLen*(s: string, pattern: TPeg, matches: var openarray[string],
start = 0): int {.nosideEffect, rtl, extern: "npegs$1Capture".} =
## the same as ``match``, but it returns the length of the match,
## if there is no match, -1 is returned. Note that a match length
## of zero can happen. It's possible that a suffix of `s` remains
## that does not belong to the match.
var c: TCaptures
result = rawMatch(s, pattern, start, c)
if result >= 0:
for i in 0..c.ml-1:
matches[i] = copy(s, c.matches[i][0], c.matches[i][1])
proc matchLen*(s: string, pattern: TPeg,
start = 0): int {.nosideEffect, rtl, extern: "npegs$1".} =
## the same as ``match``, but it returns the length of the match,
## if there is no match, -1 is returned. Note that a match length
## of zero can happen. It's possible that a suffix of `s` remains
## that does not belong to the match.
var c: TCaptures
result = rawMatch(s, pattern, start, c)
proc find*(s: string, pattern: TPeg, matches: var openarray[string],
start = 0): int {.nosideEffect, rtl, extern: "npegs$1Capture".} =
## returns the starting position of ``pattern`` in ``s`` and the captured
## substrings in the array ``matches``. If it does not match, nothing
## is written into ``matches`` and -1 is returned.
for i in start .. s.len-1:
if matchLen(s, pattern, matches, i) >= 0: return i
return -1
# could also use the pattern here: (!P .)* P
proc findBounds*(s: string, pattern: TPeg, matches: var openarray[string],
start = 0): tuple[first, last: int] {.
nosideEffect, rtl, extern: "npegs$1Capture".} =
## returns the starting position and end position of ``pattern`` in ``s``
## and the captured
## substrings in the array ``matches``. If it does not match, nothing
## is written into ``matches`` and (-1,0) is returned.
for i in start .. s.len-1:
var L = matchLen(s, pattern, matches, i)
if L >= 0: return (i, i+L-1)
return (-1, 0)
proc find*(s: string, pattern: TPeg,
start = 0): int {.nosideEffect, rtl, extern: "npegs$1".} =
## returns the starting position of ``pattern`` in ``s``. If it does not
## match, -1 is returned.
for i in start .. s.len-1:
if matchLen(s, pattern, i) >= 0: return i
return -1
iterator findAll*(s: string, pattern: TPeg, start = 0): string =
## yields all matching captures of pattern in `s`.
var matches: array[0..MaxSubpatterns-1, string]
var i = start
while i < s.len:
var L = matchLen(s, pattern, matches, i)
if L < 0: break
for k in 0..maxSubPatterns-1:
if isNil(matches[k]): break
yield matches[k]
inc(i, L)
proc findAll*(s: string, pattern: TPeg, start = 0): seq[string] {.
nosideEffect, rtl, extern: "npegs$1".} =
## returns all matching captures of pattern in `s`.
## If it does not match, @[] is returned.
accumulateResult(findAll(s, pattern, start))
template `=~`*(s: string, pattern: TPeg): expr =
## This calls ``match`` with an implicit declared ``matches`` array that
## can be used in the scope of the ``=~`` call:
##
## .. code-block:: nimrod
##
## if line =~ peg"\s* {\w+} \s* '=' \s* {\w+}":
## # matches a key=value pair:
## echo("Key: ", matches[0])
## echo("Value: ", matches[1])
## elif line =~ peg"\s*{'#'.*}":
## # matches a comment
## # note that the implicit ``matches`` array is different from the
## # ``matches`` array of the first branch
## echo("comment: ", matches[0])
## else:
## echo("syntax error")
##
when not definedInScope(matches):
var matches: array[0..maxSubpatterns-1, string]
match(s, pattern, matches)
# ------------------------- more string handling ------------------------------
proc contains*(s: string, pattern: TPeg, start = 0): bool {.
nosideEffect, rtl, extern: "npegs$1".} =
## same as ``find(s, pattern, start) >= 0``
return find(s, pattern, start) >= 0
proc contains*(s: string, pattern: TPeg, matches: var openArray[string],
start = 0): bool {.nosideEffect, rtl, extern: "npegs$1Capture".} =
## same as ``find(s, pattern, matches, start) >= 0``
return find(s, pattern, matches, start) >= 0
proc startsWith*(s: string, prefix: TPeg, start = 0): bool {.
nosideEffect, rtl, extern: "npegs$1".} =
## returns true if `s` starts with the pattern `prefix`
result = matchLen(s, prefix, start) >= 0
proc endsWith*(s: string, suffix: TPeg, start = 0): bool {.
nosideEffect, rtl, extern: "npegs$1".} =
## returns true if `s` ends with the pattern `prefix`
for i in start .. s.len-1:
if matchLen(s, suffix, i) == s.len - i: return true
proc replace*(s: string, sub: TPeg, by: string): string {.
nosideEffect, rtl, extern: "npegs$1".} =
## Replaces `sub` in `s` by the string `by`. Captures can be accessed in `by`
## with the notation ``$i`` and ``$#`` (see strutils.`%`). Examples:
##
## .. code-block:: nimrod
## "var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2")
##
## Results in:
##
## .. code-block:: nimrod
##
## "var1<-keykey; val2<-key2key2"
result = ""
var i = 0
var caps: array[0..maxSubpatterns-1, string]
while i < s.len:
var x = matchLen(s, sub, caps, i)
if x <= 0:
add(result, s[i])
inc(i)
else:
addf(result, by, caps)
inc(i, x)
# copy the rest:
add(result, copy(s, i))
proc parallelReplace*(s: string, subs: openArray[
tuple[pattern: TPeg, repl: string]]): string {.
nosideEffect, rtl, extern: "npegs$1".} =
## Returns a modified copy of `s` with the substitutions in `subs`
## applied in parallel.
result = ""
var i = 0
var caps: array[0..maxSubpatterns-1, string]
while i < s.len:
block searchSubs:
for j in 0..high(subs):
var x = matchLen(s, subs[j][0], caps, i)
if x > 0:
addf(result, subs[j][1], caps)
inc(i, x)
break searchSubs
add(result, s[i])
inc(i)
# copy the rest:
add(result, copy(s, i))
proc transformFile*(infile, outfile: string,
subs: openArray[tuple[pattern: TPeg, repl: string]]) {.
rtl, extern: "npegs$1".} =
## reads in the file `infile`, performs a parallel replacement (calls
## `parallelReplace`) and writes back to `outfile`. Calls ``quit`` if an
## error occurs. This is supposed to be used for quick scripting.
var x = readFile(infile)
if not isNil(x):
var f: TFile
if open(f, outfile, fmWrite):
write(f, x.parallelReplace(subs))
close(f)
else:
quit("cannot open for writing: " & outfile)
else:
quit("cannot open for reading: " & infile)
iterator split*(s: string, sep: TPeg): string =
## Splits the string `s` into substrings.
##
## Substrings are separated by the PEG `sep`.
## Examples:
##
## .. code-block:: nimrod
## for word in split("00232this02939is39an22example111", peg"\d+"):
## writeln(stdout, word)
##
## Results in:
##
## .. code-block:: nimrod
## "this"
## "is"
## "an"
## "example"
##
var
first = 0
last = 0
while last < len(s):
var x = matchLen(s, sep, last)
if x > 0: inc(last, x)
first = last
while last < len(s):
inc(last)
x = matchLen(s, sep, last)
if x > 0: break
if first < last:
yield copy(s, first, last-1)
proc split*(s: string, sep: TPeg): seq[string] {.
nosideEffect, rtl, extern: "npegs$1".} =
## Splits the string `s` into substrings.
accumulateResult(split(s, sep))
# ------------------- scanner -------------------------------------------------
type
TModifier = enum
modNone,
modVerbatim,
modIgnoreCase,
modIgnoreStyle
TTokKind = enum ## enumeration of all tokens
tkInvalid, ## invalid token
tkEof, ## end of file reached
tkAny, ## .
tkAnyRune, ## _
tkIdentifier, ## abc
tkStringLit, ## "abc" or 'abc'
tkCharSet, ## [^A-Z]
tkParLe, ## '('
tkParRi, ## ')'
tkCurlyLe, ## '{'
tkCurlyRi, ## '}'
tkCurlyAt, ## '{@}'
tkArrow, ## '<-'
tkBar, ## '/'
tkStar, ## '*'
tkPlus, ## '+'
tkAmp, ## '&'
tkNot, ## '!'
tkOption, ## '?'
tkAt, ## '@'
tkBuiltin, ## \identifier
tkEscaped, ## \\
tkDollar ## '$'
TToken {.final.} = object ## a token
kind: TTokKind ## the type of the token
modifier: TModifier
literal: string ## the parsed (string) literal
charset: set[char] ## if kind == tkCharSet
index: int ## if kind == tkDollar
TPegLexer = object ## the lexer object.
bufpos: int ## the current position within the buffer
buf: cstring ## the buffer itself
LineNumber: int ## the current line number
lineStart: int ## index of last line start in buffer
colOffset: int ## column to add
filename: string
const
tokKindToStr: array[TTokKind, string] = [
"invalid", "[EOF]", ".", "_", "identifier", "string literal",
"character set", "(", ")", "{", "}", "{@}",
"<-", "/", "*", "+", "&", "!", "?",
"@", "built-in", "escaped", "$"
]
proc HandleCR(L: var TPegLexer, pos: int): int =
assert(L.buf[pos] == '\c')
inc(L.linenumber)
result = pos+1
if L.buf[result] == '\L': inc(result)
L.lineStart = result
proc HandleLF(L: var TPegLexer, pos: int): int =
assert(L.buf[pos] == '\L')
inc(L.linenumber)
result = pos+1
L.lineStart = result
proc init(L: var TPegLexer, input, filename: string, line = 1, col = 0) =
L.buf = input
L.bufpos = 0
L.lineNumber = line
L.colOffset = col
L.lineStart = 0
L.filename = filename
proc getColumn(L: TPegLexer): int {.inline.} =
result = abs(L.bufpos - L.lineStart) + L.colOffset
proc getLine(L: TPegLexer): int {.inline.} =
result = L.linenumber
proc errorStr(L: TPegLexer, msg: string, line = -1, col = -1): string =
var line = if line < 0: getLine(L) else: line
var col = if col < 0: getColumn(L) else: col
result = "$1($2, $3) Error: $4" % [L.filename, $line, $col, msg]
proc handleHexChar(c: var TPegLexer, xi: var int) =
case c.buf[c.bufpos]
of '0'..'9':
xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('0'))
inc(c.bufpos)
of 'a'..'f':
xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('a') + 10)
inc(c.bufpos)
of 'A'..'F':
xi = (xi shl 4) or (ord(c.buf[c.bufpos]) - ord('A') + 10)
inc(c.bufpos)
else: nil
proc getEscapedChar(c: var TPegLexer, tok: var TToken) =
inc(c.bufpos)
case c.buf[c.bufpos]
of 'r', 'R', 'c', 'C':
add(tok.literal, '\c')
Inc(c.bufpos)
of 'l', 'L':
add(tok.literal, '\L')
Inc(c.bufpos)
of 'f', 'F':
add(tok.literal, '\f')
inc(c.bufpos)
of 'e', 'E':
add(tok.literal, '\e')
Inc(c.bufpos)
of 'a', 'A':
add(tok.literal, '\a')
Inc(c.bufpos)
of 'b', 'B':
add(tok.literal, '\b')
Inc(c.bufpos)
of 'v', 'V':
add(tok.literal, '\v')
Inc(c.bufpos)
of 't', 'T':
add(tok.literal, '\t')
Inc(c.bufpos)
of 'x', 'X':
inc(c.bufpos)
var xi = 0
handleHexChar(c, xi)
handleHexChar(c, xi)
if xi == 0: tok.kind = tkInvalid
else: add(tok.literal, Chr(xi))
of '0'..'9':
var val = ord(c.buf[c.bufpos]) - ord('0')
Inc(c.bufpos)
var i = 1
while (i <= 3) and (c.buf[c.bufpos] in {'0'..'9'}):
val = val * 10 + ord(c.buf[c.bufpos]) - ord('0')
inc(c.bufpos)
inc(i)
if val > 0 and val <= 255: add(tok.literal, chr(val))
else: tok.kind = tkInvalid
of '\0'..'\31':
tok.kind = tkInvalid
elif c.buf[c.bufpos] in strutils.letters:
tok.kind = tkInvalid
else:
add(tok.literal, c.buf[c.bufpos])
Inc(c.bufpos)
proc skip(c: var TPegLexer) =
var pos = c.bufpos
var buf = c.buf
while true:
case buf[pos]
of ' ', '\t':
Inc(pos)
of '#':
while not (buf[pos] in {'\c', '\L', '\0'}): inc(pos)
of '\c':
pos = HandleCR(c, pos)
buf = c.buf
of '\L':
pos = HandleLF(c, pos)
buf = c.buf
else:
break # EndOfFile also leaves the loop
c.bufpos = pos
proc getString(c: var TPegLexer, tok: var TToken) =
tok.kind = tkStringLit
var pos = c.bufPos + 1
var buf = c.buf
var quote = buf[pos-1]
while true:
case buf[pos]
of '\\':
c.bufpos = pos
getEscapedChar(c, tok)
pos = c.bufpos
of '\c', '\L', '\0':
tok.kind = tkInvalid
break
elif buf[pos] == quote:
inc(pos)
break
else:
add(tok.literal, buf[pos])
Inc(pos)
c.bufpos = pos
proc getDollar(c: var TPegLexer, tok: var TToken) =
var pos = c.bufPos + 1
var buf = c.buf
if buf[pos] in {'0'..'9'}:
tok.kind = tkDollar
tok.index = 0
while buf[pos] in {'0'..'9'}:
tok.index = tok.index * 10 + ord(buf[pos]) - ord('0')
inc(pos)
else:
tok.kind = tkInvalid
c.bufpos = pos
proc getCharSet(c: var TPegLexer, tok: var TToken) =
tok.kind = tkCharSet
tok.charset = {}
var pos = c.bufPos + 1
var buf = c.buf
var caret = false
if buf[pos] == '^':
inc(pos)
caret = true
while true:
var ch: char
case buf[pos]
of ']':
inc(pos)
break
of '\\':
c.bufpos = pos
getEscapedChar(c, tok)
pos = c.bufpos
ch = tok.literal[tok.literal.len-1]
of '\C', '\L', '\0':
tok.kind = tkInvalid
break
else:
ch = buf[pos]
Inc(pos)
incl(tok.charset, ch)
if buf[pos] == '-':
if buf[pos+1] == ']':
incl(tok.charset, '-')
inc(pos)
else:
inc(pos)
var ch2: char
case buf[pos]
of '\\':
c.bufpos = pos
getEscapedChar(c, tok)
pos = c.bufpos
ch2 = tok.literal[tok.literal.len-1]
of '\C', '\L', '\0':
tok.kind = tkInvalid
break
else:
ch2 = buf[pos]
Inc(pos)
for i in ord(ch)+1 .. ord(ch2):
incl(tok.charset, chr(i))
c.bufpos = pos
if caret: tok.charset = {'\1'..'\xFF'} - tok.charset
proc getSymbol(c: var TPegLexer, tok: var TToken) =
var pos = c.bufpos
var buf = c.buf
while true:
add(tok.literal, buf[pos])
Inc(pos)
if buf[pos] notin strutils.IdentChars: break
c.bufpos = pos
tok.kind = tkIdentifier
proc getBuiltin(c: var TPegLexer, tok: var TToken) =
if c.buf[c.bufpos+1] in strutils.Letters:
inc(c.bufpos)
getSymbol(c, tok)
tok.kind = tkBuiltin
else:
tok.kind = tkEscaped
getEscapedChar(c, tok) # may set tok.kind to tkInvalid
proc getTok(c: var TPegLexer, tok: var TToken) =
tok.kind = tkInvalid
tok.modifier = modNone
setlen(tok.literal, 0)
skip(c)
case c.buf[c.bufpos]
of '{':
inc(c.bufpos)
if c.buf[c.bufpos] == '@' and c.buf[c.bufpos+1] == '}':
tok.kind = tkCurlyAt
inc(c.bufpos, 2)
add(tok.literal, "{@}")
else:
tok.kind = tkCurlyLe
add(tok.literal, '{')
of '}':
tok.kind = tkCurlyRi
inc(c.bufpos)
add(tok.literal, '}')
of '[':
getCharset(c, tok)
of '(':
tok.kind = tkParLe
Inc(c.bufpos)
add(tok.literal, '(')
of ')':
tok.kind = tkParRi
Inc(c.bufpos)
add(tok.literal, ')')
of '.':
tok.kind = tkAny
inc(c.bufpos)
add(tok.literal, '.')
of '_':
tok.kind = tkAnyRune
inc(c.bufpos)
add(tok.literal, '_')
of '\\':
getBuiltin(c, tok)
of '\'', '"': getString(c, tok)
of '$': getDollar(c, tok)
of '\0':
tok.kind = tkEof
tok.literal = "[EOF]"
of 'a'..'z', 'A'..'Z', '\128'..'\255':
getSymbol(c, tok)
if c.buf[c.bufpos] in {'\'', '"', '$'}:
case tok.literal
of "i": tok.modifier = modIgnoreCase
of "y": tok.modifier = modIgnoreStyle
of "v": tok.modifier = modVerbatim
else: nil
setLen(tok.literal, 0)
if c.buf[c.bufpos] == '$':
getDollar(c, tok)
else:
getString(c, tok)
if tok.modifier == modNone: tok.kind = tkInvalid
of '+':
tok.kind = tkPlus
inc(c.bufpos)
add(tok.literal, '+')
of '*':
tok.kind = tkStar
inc(c.bufpos)
add(tok.literal, '+')
of '<':
if c.buf[c.bufpos+1] == '-':
inc(c.bufpos, 2)
tok.kind = tkArrow
add(tok.literal, "<-")
else:
add(tok.literal, '<')
of '/':
tok.kind = tkBar
inc(c.bufpos)
add(tok.literal, '/')
of '?':
tok.kind = tkOption
inc(c.bufpos)
add(tok.literal, '?')
of '!':
tok.kind = tkNot
inc(c.bufpos)
add(tok.literal, '!')
of '&':
tok.kind = tkAmp
inc(c.bufpos)
add(tok.literal, '!')
of '@':
tok.kind = tkAt
inc(c.bufpos)
add(tok.literal, '@')
if c.buf[c.bufpos] == '@':
tok.kind = tkCurlyAt
inc(c.bufpos)
add(tok.literal, '@')
else:
add(tok.literal, c.buf[c.bufpos])
inc(c.bufpos)
proc arrowIsNextTok(c: TPegLexer): bool =
# the only look ahead we need
var pos = c.bufpos
while c.buf[pos] in {'\t', ' '}: inc(pos)
result = c.buf[pos] == '<' and c.buf[pos+1] == '-'
# ----------------------------- parser ----------------------------------------
type
EInvalidPeg* = object of EBase ## raised if an invalid PEG has been detected
TPegParser = object of TPegLexer ## the PEG parser object
tok: TToken
nonterms: seq[PNonTerminal]
modifier: TModifier
captures: int
identIsVerbatim: bool
proc pegError(p: TPegParser, msg: string, line = -1, col = -1) =
var e: ref EInvalidPeg
new(e)
e.msg = errorStr(p, msg, line, col)
raise e
proc getTok(p: var TPegParser) =
getTok(p, p.tok)
if p.tok.kind == tkInvalid: pegError(p, "invalid token")
proc eat(p: var TPegParser, kind: TTokKind) =
if p.tok.kind == kind: getTok(p)
else: pegError(p, tokKindToStr[kind] & " expected")
proc parseExpr(p: var TPegParser): TPeg
proc getNonTerminal(p: var TPegParser, name: string): PNonTerminal =
for i in 0..high(p.nonterms):
result = p.nonterms[i]
if cmpIgnoreStyle(result.name, name) == 0: return
# forward reference:
result = newNonTerminal(name, getLine(p), getColumn(p))
add(p.nonterms, result)
proc modifiedTerm(s: string, m: TModifier): TPeg =
case m
of modNone, modVerbatim: result = term(s)
of modIgnoreCase: result = termIgnoreCase(s)
of modIgnoreStyle: result = termIgnoreStyle(s)
proc modifiedBackref(s: int, m: TModifier): TPeg =
case m
of modNone, modVerbatim: result = backRef(s)
of modIgnoreCase: result = backRefIgnoreCase(s)
of modIgnoreStyle: result = backRefIgnoreStyle(s)
proc primary(p: var TPegParser): TPeg =
case p.tok.kind
of tkAmp:
getTok(p)
return &primary(p)
of tkNot:
getTok(p)
return !primary(p)
of tkAt:
getTok(p)
return @primary(p)
of tkCurlyAt:
getTok(p)
return @@primary(p)
else: nil
case p.tok.kind
of tkIdentifier:
if p.identIsVerbatim:
var m = p.tok.modifier
if m == modNone: m = p.modifier
result = modifiedTerm(p.tok.literal, m)
getTok(p)
elif not arrowIsNextTok(p):
var nt = getNonTerminal(p, p.tok.literal)
incl(nt.flags, ntUsed)
result = nonTerminal(nt)
getTok(p)
else:
pegError(p, "expression expected, but found: " & p.tok.literal)
of tkStringLit:
var m = p.tok.modifier
if m == modNone: m = p.modifier
result = modifiedTerm(p.tok.literal, m)
getTok(p)
of tkCharSet:
if '\0' in p.tok.charset:
pegError(p, "binary zero ('\\0') not allowed in character class")
result = charset(p.tok.charset)
getTok(p)
of tkParLe:
getTok(p)
result = parseExpr(p)
eat(p, tkParRi)
of tkCurlyLe:
getTok(p)
result = capture(parseExpr(p))
eat(p, tkCurlyRi)
inc(p.captures)
of tkAny:
result = any()
getTok(p)
of tkAnyRune:
result = anyRune()
getTok(p)
of tkBuiltin:
case p.tok.literal
of "n": result = newLine()
of "d": result = charset({'0'..'9'})
of "D": result = charset({'\1'..'\xff'} - {'0'..'9'})
of "s": result = charset({' ', '\9'..'\13'})
of "S": result = charset({'\1'..'\xff'} - {' ', '\9'..'\13'})
of "w": result = charset({'a'..'z', 'A'..'Z', '_', '0'..'9'})
of "W": result = charset({'\1'..'\xff'} - {'a'..'z','A'..'Z','_','0'..'9'})
of "a": result = charset({'a'..'z', 'A'..'Z'})
of "A": result = charset({'\1'..'\xff'} - {'a'..'z', 'A'..'Z'})
of "ident": result = pegs.ident
of "letter": result = UnicodeLetter()
of "upper": result = UnicodeUpper()
of "lower": result = UnicodeLower()
of "title": result = UnicodeTitle()
of "white": result = UnicodeWhitespace()
else: pegError(p, "unknown built-in: " & p.tok.literal)
getTok(p)
of tkEscaped:
result = term(p.tok.literal[0])
getTok(p)
of tkDollar:
var m = p.tok.modifier
if m == modNone: m = p.modifier
result = modifiedBackRef(p.tok.index, m)
if p.tok.index < 0 or p.tok.index > p.captures:
pegError(p, "invalid back reference index: " & $p.tok.index)
getTok(p)
else:
pegError(p, "expression expected, but found: " & p.tok.literal)
getTok(p) # we must consume a token here to prevent endless loops!
while true:
case p.tok.kind
of tkOption:
result = ?result
getTok(p)
of tkStar:
result = *result
getTok(p)
of tkPlus:
result = +result
getTok(p)
else: break
proc seqExpr(p: var TPegParser): TPeg =
result = primary(p)
while true:
case p.tok.kind
of tkAmp, tkNot, tkAt, tkStringLit, tkCharset, tkParLe, tkCurlyLe,
tkAny, tkAnyRune, tkBuiltin, tkEscaped, tkDollar, tkCurlyAt:
result = sequence(result, primary(p))
of tkIdentifier:
if not arrowIsNextTok(p):
result = sequence(result, primary(p))
else: break
else: break
proc parseExpr(p: var TPegParser): TPeg =
result = seqExpr(p)
while p.tok.kind == tkBar:
getTok(p)
result = result / seqExpr(p)
proc parseRule(p: var TPegParser): PNonTerminal =
if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
result = getNonTerminal(p, p.tok.literal)
if ntDeclared in result.flags:
pegError(p, "attempt to redefine: " & result.name)
result.line = getLine(p)
result.col = getColumn(p)
getTok(p)
eat(p, tkArrow)
result.rule = parseExpr(p)
incl(result.flags, ntDeclared) # NOW inlining may be attempted
else:
pegError(p, "rule expected, but found: " & p.tok.literal)
proc rawParse(p: var TPegParser): TPeg =
## parses a rule or a PEG expression
if p.tok.kind == tkBuiltin:
case p.tok.literal
of "i":
p.modifier = modIgnoreCase
getTok(p)
of "y":
p.modifier = modIgnoreStyle
getTok(p)
else: nil
if p.tok.kind == tkIdentifier and arrowIsNextTok(p):
result = parseRule(p).rule
while p.tok.kind != tkEof:
discard parseRule(p)
else:
p.identIsVerbatim = true
result = parseExpr(p)
if p.tok.kind != tkEof:
pegError(p, "EOF expected, but found: " & p.tok.literal)
for i in 0..high(p.nonterms):
var nt = p.nonterms[i]
if ntDeclared notin nt.flags:
pegError(p, "undeclared identifier: " & nt.name, nt.line, nt.col)
elif ntUsed notin nt.flags and i > 0:
pegError(p, "unused rule: " & nt.name, nt.line, nt.col)
proc parsePeg*(pattern: string, filename = "pattern", line = 1, col = 0): TPeg =
## constructs a TPeg object from `pattern`. `filename`, `line`, `col` are
## used for error messages, but they only provide start offsets. `parsePeg`
## keeps track of line and column numbers within `pattern`.
var p: TPegParser
init(TPegLexer(p), pattern, filename, line, col)
p.tok.kind = tkInvalid
p.tok.modifier = modNone
p.tok.literal = ""
p.tok.charset = {}
p.nonterms = @[]
p.identIsVerbatim = false
getTok(p)
result = rawParse(p)
proc peg*(pattern: string): TPeg =
## constructs a TPeg object from the `pattern`. The short name has been
## chosen to encourage its use as a raw string modifier::
##
## peg"{\ident} \s* '=' \s* {.*}"
result = parsePeg(pattern, "pattern")
proc escapePeg*(s: string): string =
## escapes `s` so that it is matched verbatim when used as a peg.
result = ""
var inQuote = false
for c in items(s):
case c
of '\0'..'\31', '\'', '"', '\\':
if inQuote:
result.add('\'')
inQuote = false
result.add("\\x")
result.add(toHex(ord(c), 2))
else:
if not inQuote:
result.add('\'')
inQuote = true
result.add(c)
if inQuote: result.add('\'')
when isMainModule:
assert escapePeg("abc''def'") == r"'abc'\x27\x27'def'\x27"
assert match("(a b c)", peg"'(' @ ')'")
assert match("W_HI_Le", peg"\y 'while'")
assert(not match("W_HI_L", peg"\y 'while'"))
assert(not match("W_HI_Le", peg"\y v'while'"))
assert match("W_HI_Le", peg"y'while'")
assert($ +digits == $peg"\d+")
assert "0158787".match(peg"\d+")
assert "ABC 0232".match(peg"\w+\s+\d+")
assert "ABC".match(peg"\d+ / \w+")
for word in split("00232this02939is39an22example111", peg"\d+"):
writeln(stdout, word)
assert matchLen("key", ident) == 3
var pattern = sequence(ident, *whitespace, term('='), *whitespace, ident)
assert matchLen("key1= cal9", pattern) == 11
var ws = newNonTerminal("ws", 1, 1)
ws.rule = *whitespace
var expr = newNonTerminal("expr", 1, 1)
expr.rule = sequence(capture(ident), *sequence(
nonterminal(ws), term('+'), nonterminal(ws), nonterminal(expr)))
var c: TCaptures
var s = "a+b + c +d+e+f"
assert rawMatch(s, expr.rule, 0, c) == len(s)
var a = ""
for i in 0..c.ml-1:
a.add(copy(s, c.matches[i][0], c.matches[i][1]))
assert a == "abcdef"
#echo expr.rule
#const filename = "lib/devel/peg/grammar.txt"
#var grammar = parsePeg(newFileStream(filename, fmRead), filename)
#echo "a <- [abc]*?".match(grammar)
assert find("_____abc_______", term("abc"), 2) == 5
assert match("_______ana", peg"A <- 'ana' / . A")
assert match("abcs%%%", peg"A <- ..A / .A / '%'")
if "abc" =~ peg"{'a'}'bc' 'xyz' / {\ident}":
assert matches[0] == "abc"
else:
assert false
var g2 = peg"""S <- A B / C D
A <- 'a'+
B <- 'b'+
C <- 'c'+
D <- 'd'+
"""
assert($g2 == "((A B) / (C D))")
assert match("cccccdddddd", g2)
assert("var1=key; var2=key2".replace(peg"{\ident}'='{\ident}", "$1<-$2$2") ==
"var1<-keykey; var2<-key2key2")
assert "var1=key; var2=key2".endsWith(peg"{\ident}'='{\ident}")
if "aaaaaa" =~ peg"'aa' !. / ({'a'})+":
assert matches[0] == "a"
else:
assert false
var matches: array[0..5, string]
if match("abcdefg", peg"c {d} ef {g}", matches, 2):
assert matches[0] == "d"
assert matches[1] == "g"
else:
assert false
for x in findAll("abcdef", peg"{.}", 3):
echo x
if "f(a, b)" =~ peg"{[0-9]+} / ({\ident} '(' {@} ')')":
assert matches[0] == "f"
assert matches[1] == "a, b"
else:
assert false
assert match("eine übersicht und außerdem", peg"(\letter \white*)+")
# ß is not a lower cased letter?!
assert match("eine übersicht und auerdem", peg"(\lower \white*)+")
assert match("EINE ÜBERSICHT UND AUSSERDEM", peg"(\upper \white*)+")
assert(not match("456678", peg"(\letter)+"))