diff options
author | bptato <nincsnevem662@gmail.com> | 2023-07-25 22:10:28 +0200 |
---|---|---|
committer | bptato <nincsnevem662@gmail.com> | 2023-07-25 22:10:28 +0200 |
commit | dcec174b2af3af816180772f948575f896d58d20 (patch) | |
tree | 72006d4d310f7f7411b9e684567786e9b634f902 | |
parent | 5153b064d59b627cd2ea061bf88078cbfddfafa6 (diff) | |
download | chawan-dcec174b2af3af816180772f948575f896d58d20.tar.gz |
Add compileMatchRegex
This makes it so that host = 'example\.org' mandates an exact match, but host = '^example' matches example.org, example.com, etc. (Previously, 'example\.org' would have matched exampleexample.org as well, which was quite counter-intuitive.)
-rw-r--r-- | doc/config.md | 42 | ||||
-rw-r--r-- | src/config/config.nim | 8 | ||||
-rw-r--r-- | src/js/regex.nim | 35 |
3 files changed, 73 insertions, 12 deletions
diff --git a/doc/config.md b/doc/config.md
index 443d7003..ccaa64ab 100644
--- a/doc/config.md
+++ b/doc/config.md
@@ -35,6 +35,8 @@ examples.
 * [Keybindings](#keybindings)
     * [Pager actions](#pager-actions)
     * [Line-editing actions](#line-editing-actions)
+* [Appendix](#appendix)
+    * [Regex handling](#regex-handling)
 
 ## Start
 
@@ -314,7 +316,9 @@ Omnirule options:
 <td>match</td>
 <td>regex</td>
 <td>Regular expression used to match the input string. Note that websites
-passed as arguments are matched as well.</td>
+passed as arguments are matched as well.<br>
+Note: regexes are handled according to the [regex handling](#regex-handling)
+rules.</td>
 </tr>
 
 <tr>
@@ -335,12 +339,12 @@ Examples:
 ```
 # Enable cookies on the orange website for log-in.
 [[siteconf]]
-url = '^https://news\.ycombinator\.com/.*'
+url = 'https://news\.ycombinator\.com/.*'
 cookie = true
 
 # Redirect npr.org to text.npr.org.
 [[siteconf]]
-host = '^(www\.)?npr\.org$'
+host = '(www\.)?npr\.org'
 rewrite-url = '''
 (x) => {
     x.host = "text.npr.org";
@@ -351,10 +355,10 @@ rewrite-url = '''
 
 # Allow cookie sharing on *sr.ht domains.
 [[siteconf]]
-host = '^.*sr\.ht$'
+host = '.*sr\.ht'
 cookie = true
 share-cookie-jar = 'sr.ht'
-third-party-cookie = '^.*\.sr.ht$'
+third-party-cookie = '.*\.sr.ht'
 ```
 
 Siteconf options:
@@ -371,14 +375,18 @@ Siteconf options:
 <td>url</td>
 <td>regex</td>
 <td>Regular expression used to match the URL. Either this or the `host` option
-must be specified.</td>
+must be specified.<br>
+Note: regexes are handled according to the [regex handling](#regex-handling)
+rules.</td>
 </tr>
 
 <tr>
 <td>host</td>
 <td>regex</td>
 <td>Regular expression used to match the host part of the URL (i.e. domain
-name/ip address.) Either this or the `url` option must be specified.</td>
+name/ip address.) Either this or the `url` option must be specified.<br>
+Note: regexes are handled according to the [regex handling](#regex-handling)
+rules.</td>
 </tr>
 
 <tr>
@@ -399,7 +407,9 @@ false for all websites.</td>
 <td>third-party-cookie</td>
 <td>regex/array of regexes</td>
 <td>Domains for which third-party cookies are allowed on this domain. Note:
-this only works for buffers which share the same cookie jar.</td>
+this only works for buffers which share the same cookie jar.<br>
+Note: regexes are handled according to the [regex handling](#regex-handling)
+rules.</td>
 </tr>
 
 <tr>
@@ -876,3 +886,19 @@ as a word boundary.
 # Control+W deletes everything before the cursor until it reaches a space.
 'C-w' = 'line.clearWord(x => x == " ")'
 ```
+
+## Appendix
+
+### Regex handling
+
+Regular expressions are assumed to be exact matches, except when they start
+with a caret (^) sign or end with an unescaped dollar ($) sign.
+
+In other words, the following transformations occur:
+
+```
+^abcd -> ^abcd
+efgh$ -> efgh$
+^ijkl$ -> ^ijkl$
+mnop -> ^mnop$
+```
diff --git a/src/config/config.nim b/src/config/config.nim
index b546ce3c..dfcb63c4 100644
--- a/src/config/config.nim
+++ b/src/config/config.nim
@@ -183,11 +183,11 @@ proc getSiteConfig*(config: Config, jsctx: JSContext): seq[SiteConfig] =
       images: sc.images
     )
     if sc.url.isSome:
-      conf.url = opt(compileRegex(sc.url.get, 0))
+      conf.url = opt(compileMatchRegex(sc.url.get))
     elif sc.host.isSome:
-      conf.host = opt(compileRegex(sc.host.get, 0))
+      conf.host = opt(compileMatchRegex(sc.host.get))
     for rule in sc.third_party_cookie:
-      conf.third_party_cookie.add(compileRegex(rule, 0).get)
+      conf.third_party_cookie.add(compileMatchRegex(rule).get)
     if sc.rewrite_url.isSome:
       let fun = jsctx.eval(sc.rewrite_url.get, "<siteconf>",
         JS_EVAL_TYPE_GLOBAL)
@@ -201,7 +201,7 @@ proc getSiteConfig*(config: Config, jsctx: JSContext): seq[SiteConfig] =
 
 proc getOmniRules*(config: Config, jsctx: JSContext): seq[OmniRule] =
   for rule in config.omnirule:
-    let re = compileRegex(rule.match, 0)
+    let re = compileMatchRegex(rule.match)
     var conf = OmniRule(
       match: re.get
     )
diff --git a/src/js/regex.nim b/src/js/regex.nim
index d1ec34a5..eb6ef80f 100644
--- a/src/js/regex.nim
+++ b/src/js/regex.nim
@@ -68,6 +68,41 @@ proc compileRegex*(buf: string, flags: int): Result[Regex, string] =
   regex.bytecode = bytecode
   return ok(regex)
 
+func countBackslashes(buf: string, i: int): int =
+  var j = 0
+  for i in countdown(i, 0):
+    if buf[i] != '\\':
+      break
+    inc j
+  return j
+
+# ^abcd -> ^abcd
+# efgh$ -> efgh$
+# ^ijkl$ -> ^ijkl$
+# mnop -> ^mnop$
+proc compileMatchRegex*(buf: string): Result[Regex, string] =
+  if buf.len == 0:
+    return compileRegex(buf, 0)
+  if buf[0] == '^':
+    return compileRegex(buf, 0)
+  if buf[^1] == '$':
+    # Check whether the final dollar sign is escaped.
+    if buf.len == 1 or buf[^2] != '\\':
+      return compileRegex(buf, 0)
+    let j = buf.countBackslashes(buf.high - 2)
+    if j mod 2 == 1: # odd, because we do not count the last backslash
+      return compileRegex(buf, 0)
+    # escaped. proceed as if no dollar sign was at the end
+  if buf[^1] == '\\':
+    # Check if the regex contains an invalid trailing backslash.
+    let j = buf.countBackslashes(buf.high - 1)
+    if j mod 2 != 1: # odd, because we do not count the last backslash
+      return err("unexpected end")
+  var buf2 = "^"
+  buf2 &= buf
+  buf2 &= "$"
+  return compileRegex(buf2, 0)
+
 proc compileSearchRegex*(str: string): Result[Regex, string] =
   # Parse any applicable flags in regex/<flags>. The last forward slash is
   # dropped when <flags> is empty, and interpreted as a character when the