From dcec174b2af3af816180772f948575f896d58d20 Mon Sep 17 00:00:00 2001 From: bptato Date: Tue, 25 Jul 2023 22:10:28 +0200 Subject: Add compileMatchRegex This makes it so that host = 'example\.org' mandates an exact match, but host = '^example' matches example.org, example.com, etc. (Previously, 'example\.org' would have matched exampleexample.org as well, which was quite counter-intuitive.) --- doc/config.md | 42 ++++++++++++++++++++++++++++++++++-------- src/config/config.nim | 8 ++++---- src/js/regex.nim | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/doc/config.md b/doc/config.md index 443d7003..ccaa64ab 100644 --- a/doc/config.md +++ b/doc/config.md @@ -35,6 +35,8 @@ examples. * [Keybindings](#keybindings) * [Pager actions](#pager-actions) * [Line-editing actions](#line-editing-actions) +* [Appendix](#appendix) + * [Regex handling](#regex-handling) ## Start @@ -314,7 +316,9 @@ Omnirule options: match regex Regular expression used to match the input string. Note that websites -passed as arguments are matched as well. +passed as arguments are matched as well.
+Note: regexes are handled according to the [regex handling](#regex-handling) +rules. @@ -335,12 +339,12 @@ Examples: ``` # Enable cookies on the orange website for log-in. [[siteconf]] -url = '^https://news\.ycombinator\.com/.*' +url = 'https://news\.ycombinator\.com/.*' cookie = true # Redirect npr.org to text.npr.org. [[siteconf]] -host = '^(www\.)?npr\.org$' +host = '(www\.)?npr\.org' rewrite-url = ''' (x) => { x.host = "text.npr.org"; @@ -351,10 +355,10 @@ rewrite-url = ''' # Allow cookie sharing on *sr.ht domains. [[siteconf]] -host = '^.*sr\.ht$' +host = '.*sr\.ht' cookie = true share-cookie-jar = 'sr.ht' -third-party-cookie = '^.*\.sr.ht$' +third-party-cookie = '.*\.sr\.ht' ``` Siteconf options: @@ -371,14 +375,18 @@ Siteconf options: url regex Regular expression used to match the URL. Either this or the `host` option -must be specified. +must be specified.
+Note: regexes are handled according to the [regex handling](#regex-handling) +rules. host regex Regular expression used to match the host part of the URL (i.e. domain -name/ip address.) Either this or the `url` option must be specified. +name/ip address.) Either this or the `url` option must be specified.
+Note: regexes are handled according to the [regex handling](#regex-handling) +rules. @@ -399,7 +407,9 @@ false for all websites. third-party-cookie regex/array of regexes Domains for which third-party cookies are allowed on this domain. Note: -this only works for buffers which share the same cookie jar. +this only works for buffers which share the same cookie jar.
+Note: regexes are handled according to the [regex handling](#regex-handling) +rules. @@ -876,3 +886,19 @@ as a word boundary. # Control+W deletes everything before the cursor until it reaches a space. 'C-w' = 'line.clearWord(x => x == " ")' ``` + +## Appendix + +### Regex handling + +Regular expressions are assumed to be exact matches, except when they start +with a caret (^) sign or end with an unescaped dollar ($) sign. + +In other words, the following transformations occur: + +``` +^abcd -> ^abcd +efgh$ -> efgh$ +^ijkl$ -> ^ijkl$ +mnop -> ^mnop$ +``` diff --git a/src/config/config.nim b/src/config/config.nim index b546ce3c..dfcb63c4 100644 --- a/src/config/config.nim +++ b/src/config/config.nim @@ -183,11 +183,11 @@ proc getSiteConfig*(config: Config, jsctx: JSContext): seq[SiteConfig] = images: sc.images ) if sc.url.isSome: - conf.url = opt(compileRegex(sc.url.get, 0)) + conf.url = opt(compileMatchRegex(sc.url.get)) elif sc.host.isSome: - conf.host = opt(compileRegex(sc.host.get, 0)) + conf.host = opt(compileMatchRegex(sc.host.get)) for rule in sc.third_party_cookie: - conf.third_party_cookie.add(compileRegex(rule, 0).get) + conf.third_party_cookie.add(compileMatchRegex(rule).get) if sc.rewrite_url.isSome: let fun = jsctx.eval(sc.rewrite_url.get, "", JS_EVAL_TYPE_GLOBAL) @@ -201,7 +201,7 @@ proc getSiteConfig*(config: Config, jsctx: JSContext): seq[SiteConfig] = proc getOmniRules*(config: Config, jsctx: JSContext): seq[OmniRule] = for rule in config.omnirule: - let re = compileRegex(rule.match, 0) + let re = compileMatchRegex(rule.match) var conf = OmniRule( match: re.get ) diff --git a/src/js/regex.nim b/src/js/regex.nim index d1ec34a5..eb6ef80f 100644 --- a/src/js/regex.nim +++ b/src/js/regex.nim @@ -68,6 +68,41 @@ proc compileRegex*(buf: string, flags: int): Result[Regex, string] = regex.bytecode = bytecode return ok(regex) +func countBackslashes(buf: string, i: int): int = + var j = 0 + for i in countdown(i, 0): + if buf[i] != '\\': + break + inc j + 
func countBackslashes(buf: string, i: int): int =
  ## Length of the run of consecutive backslashes in `buf` that ends at
  ## index `i` (inclusive), scanning towards the start of the string.
  ## Yields 0 when `i` is negative or `buf[i]` is not a backslash.
  var k = i
  while k >= 0 and buf[k] == '\\':
    inc result
    dec k

# Anchoring rules applied by compileMatchRegex:
# ^abcd -> ^abcd
# efgh$ -> efgh$
# ^ijkl$ -> ^ijkl$
# mnop -> ^(?:mnop)$
proc compileMatchRegex*(buf: string): Result[Regex, string] =
  ## Compiles `buf` so that it must match its whole input, unless the
  ## pattern is already anchored by the user.
  ##
  ## * Empty patterns, patterns starting with `^`, and patterns ending in
  ##   an unescaped `$` are passed to `compileRegex` unchanged.
  ## * A pattern ending in a dangling (unescaped) backslash is rejected
  ##   with `err("unexpected end")`; appending anything to it would
  ##   silently turn the appended character into an escape.
  ## * Every other pattern is compiled as `^(?:<buf>)$`.
  ##
  ## The non-capturing group is required for correctness: `|` binds looser
  ## than the anchors, so a bare `^a|b$` would mean "starts with a OR ends
  ## with b" instead of "is exactly a or exactly b". `(?:...)` does not
  ## shift the numbering of the user's own capture groups.
  if buf.len == 0 or buf[0] == '^':
    return compileRegex(buf, 0)
  case buf[^1]
  of '$':
    # An even number of backslashes in front of the dollar sign means they
    # escape each other, so the `$` itself is a real end anchor.
    if buf.countBackslashes(buf.high - 1) mod 2 == 0:
      return compileRegex(buf, 0)
    # Otherwise the dollar is a literal `\$`; anchor like any other pattern.
  of '\\':
    # An odd-length trailing run leaves the final backslash with nothing
    # to escape.
    if buf.countBackslashes(buf.high) mod 2 == 1:
      return err("unexpected end")
  else:
    discard
  return compileRegex("^(?:" & buf & ")$", 0)