diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/punycode.nim | 211 |
1 files changed, 211 insertions, 0 deletions
#
#
#            Nim's Runtime Library
#        (c) Copyright 2016 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## .. note:: In order to use this module, run `nimble install punycode`.
##
## Implements a representation of Unicode with the limited
## ASCII character subset.

import strutils
import unicode

# issue #3045

const
  Base = 36          # number of distinct digit values ('a'..'z', '0'..'9')
  TMin = 1           # lower clamp for the per-digit threshold
  TMax = 26          # upper clamp for the per-digit threshold
  Skew = 38
  Damp = 700         # divisor applied to the very first delta in `adapt`
  InitialBias = 72
  InitialN = 128     # first non-basic code point
  Delimiter = '-'    # separates the literal basic part from the digits
  MaxCodePoint = 0x10FFFF
  SurrogateFirst = 0xD800
  SurrogateLast = 0xDFFF

type
  PunyError* = object of ValueError
    ## Raised by `encode` and `decode` when the input cannot be processed.

func decodeDigit(x: char): int {.raises: [PunyError].} =
  ## Maps a digit character to its numeric value: letters (either case)
  ## give 0..25 and '0'..'9' give 26..35.
  ## Raises `PunyError` for any other character.
  if '0' <= x and x <= '9':
    result = ord(x) - (ord('0') - 26)
  elif 'A' <= x and x <= 'Z':
    result = ord(x) - ord('A')
  elif 'a' <= x and x <= 'z':
    result = ord(x) - ord('a')
  else:
    raise newException(PunyError, "Bad input")

func encodeDigit(digit: int): Rune {.raises: [PunyError].} =
  ## Maps a numeric value to its digit character: 0..25 give 'a'..'z'
  ## and 26..35 give '0'..'9'.
  ## Raises `PunyError` when `digit` is outside 0..35.
  if 0 <= digit and digit < 26:
    result = Rune(digit + ord('a'))
  elif 26 <= digit and digit < 36:
    result = Rune(digit + (ord('0') - 26))
  else:
    raise newException(PunyError, "internal error in punycode encoding")

func isBasic(c: char): bool = ord(c) < 0x80
  ## True when `c` lies below 0x80.

func isBasic(r: Rune): bool = int(r) < 0x80
  ## True when `r` lies below 0x80.

func adapt(delta, numPoints: int, first: bool): int =
  ## Computes the bias to use for the next delta.
  ##
  ## `delta` is the delta that was just processed, `numPoints` the number
  ## of code points handled so far (including the current one) and `first`
  ## tells whether this was the first delta of the string (it is then
  ## scaled down by `Damp` instead of by 2).
  var d = if first: delta div Damp else: delta div 2
  d += d div numPoints
  var k = 0
  while d > ((Base-TMin)*TMax) div 2:
    d = d div (Base - TMin)
    k += Base
  result = k + (Base - TMin + 1) * d div (d + Skew)

func encode*(prefix, s: string): string {.raises: [PunyError].} =
  ## Encode a string that may contain Unicode.
  ## Prepend `prefix` to the result.
  ##
  ## Basic (below 0x80) characters of `s` are copied verbatim, followed by
  ## `Delimiter` if there was at least one of them; the remaining code
  ## points are then appended as variable-length digit sequences.
  ##
  ## Raises `PunyError` if the running delta would overflow.
  result = prefix
  var (d, n, bias) = (0, InitialN, InitialBias)
  var (b, remaining) = (0, 0)
  for r in s.runes:
    if r.isBasic:
      # basic Ascii character
      inc b
      result.add($r)
    else:
      # special character
      inc remaining

  var h = b # number of code points that have been handled so far
  if b > 0:
    result.add(Delimiter) # we have some Ascii chars
  while remaining != 0:
    # find the smallest code point that is still >= n
    var m: int = high(int32)
    for r in s.runes:
      if m > int(r) and int(r) >= n:
        m = int(r)
    # Check *before* doing the arithmetic: with overflow checks enabled a
    # post-hoc `d < 0` test is never reached because the operation traps.
    let gap = m - n
    if gap > (high(int) - d) div (h + 1):
      raise newException(PunyError, "invalid label " & s)
    d += gap * (h + 1)
    n = m
    for r in s.runes:
      if int(r) < n:
        if d == high(int):
          raise newException(PunyError, "invalid label " & s)
        inc d
        continue
      if int(r) > n:
        continue
      # int(r) == n: emit d as a variable-length integer
      var q = d
      var k = Base
      while true:
        var t = k - bias
        if t < TMin:
          t = TMin
        elif t > TMax:
          t = TMax
        if q < t:
          break
        result.add($encodeDigit(t + (q - t) mod (Base - t)))
        q = (q - t) div (Base - t)
        k += Base
      result.add($encodeDigit(q))
      bias = adapt(d, h + 1, h == b)
      d = 0
      inc h
      dec remaining
    inc d
    inc n

func encode*(s: string): string {.raises: [PunyError].} =
  ## Encode a string that may contain Unicode. Prefix is empty.
  result = encode("", s)

func decode*(encoded: string): string {.raises: [PunyError].} =
  ## Decode a Punycode-encoded string.
  ##
  ## Everything before the last `Delimiter` (if it is not at index 0) is
  ## copied verbatim; the rest is read as digit sequences describing where
  ## to insert which code point.
  ##
  ## Raises `PunyError` when the input contains a non-basic character or an
  ## invalid digit, is truncated, overflows, or describes a value that is
  ## not a valid Unicode scalar value (above U+10FFFF or a surrogate).
  var
    n = InitialN
    i = 0
    bias = InitialBias
  var d = rfind(encoded, Delimiter)
  var output: seq[Rune]

  if d > 0:
    # found Delimiter
    for j in 0..<d:
      let c = encoded[j] # char
      if not c.isBasic:
        raise newException(PunyError, "Encoded contains a non-basic char")
      output.add(Rune(c)) # add the character
    inc d # skip the delimiter itself
  else:
    d = 0 # set to first index

  while d < len(encoded):
    let oldi = i
    var w = 1
    var k = Base
    while true:
      if d == len(encoded):
        raise newException(PunyError, "Bad input: " & encoded)
      let c = encoded[d]
      inc d
      let digit = decodeDigit(c)
      if digit > (high(int32) - i) div w:
        raise newException(PunyError, "Too large a value: " & $digit)
      i += digit * w
      let t =
        if k <= bias: TMin
        elif k >= bias + TMax: TMax
        else: k - bias
      if digit < t:
        break
      # Guard the weight update so that a platform with a narrow `int`
      # reports PunyError instead of trapping with an overflow defect.
      if w > high(int) div (Base - t):
        raise newException(PunyError, "Value too large")
      w *= Base - t
      k += Base
    bias = adapt(i - oldi, len(output) + 1, oldi == 0)

    if i div (len(output) + 1) > high(int32) - n:
      raise newException(PunyError, "Value too large")

    n += i div (len(output) + 1)
    # Only Unicode scalar values can be represented in the UTF-8 result.
    if n > MaxCodePoint or (n >= SurrogateFirst and n <= SurrogateLast):
      raise newException(PunyError, "Invalid code point: " & $n)
    i = i mod (len(output) + 1)
    insert(output, Rune(n), i)
    inc i

  result = $output

runnableExamples:
  static:
    block:
      doAssert encode("") == ""
      doAssert encode("a") == "a-"
      doAssert encode("A") == "A-"
      doAssert encode("3") == "3-"
      doAssert encode("-") == "--"
      doAssert encode("--") == "---"
      doAssert encode("abc") == "abc-"
      doAssert encode("London") == "London-"
      doAssert encode("Lloyd-Atkinson") == "Lloyd-Atkinson-"
      doAssert encode("This has spaces") == "This has spaces-"
      doAssert encode("ü") == "tda"
      doAssert encode("München") == "Mnchen-3ya"
      doAssert encode("Mnchen-3ya") == "Mnchen-3ya-"
      doAssert encode("München-Ost") == "Mnchen-Ost-9db"
      doAssert encode("Bahnhof München-Ost") == "Bahnhof Mnchen-Ost-u6b"
    block:
      doAssert decode("") == ""
      doAssert decode("a-") == "a"
      doAssert decode("A-") == "A"
      doAssert decode("3-") == "3"
      doAssert decode("--") == "-"
      doAssert decode("---") == "--"
      doAssert decode("abc-") == "abc"
      doAssert decode("London-") == "London"
      doAssert decode("Lloyd-Atkinson-") == "Lloyd-Atkinson"
      doAssert decode("This has spaces-") == "This has spaces"
      doAssert decode("tda") == "ü"
      doAssert decode("Mnchen-3ya") == "München"
      doAssert decode("Mnchen-3ya-") == "Mnchen-3ya"
      doAssert decode("Mnchen-Ost-9db") == "München-Ost"
      doAssert decode("Bahnhof Mnchen-Ost-u6b") == "Bahnhof München-Ost"