diff options
Diffstat (limited to 'lib/pure/unidecode/unidecode.nim')
-rw-r--r-- | lib/pure/unidecode/unidecode.nim | 64 |
1 files changed, 64 insertions, 0 deletions
#
#
#            Nim's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module is based on Python's [Unidecode](https://pypi.org/project/Unidecode/)
## module by Tomaz Solc, which in turn is based on the
## [Text::Unidecode](https://metacpan.org/pod/Text::Unidecode)
## Perl module by Sean M. Burke.
##
## It provides a `unidecode proc <#unidecode,string>`_ that does
## Unicode to ASCII transliterations: It finds the sequence of ASCII characters
## that is the closest approximation to the Unicode string.
##
## For example, the closest to string "Äußerst" in ASCII is "Ausserst". Some
## information is lost in this transformation, of course, since several Unicode
## strings can be transformed to the same ASCII representation. So this is a
## strictly one-way transformation. However, a human reader will probably
## still be able to guess from the context, what the original string was.
##
## This module needs the data file `unidecode.dat` to work: This file is
## embedded as a resource into your application by default. You can also
## define the symbol `--define:noUnidecodeTable` during compile time and
## use the `loadUnidecodeTable proc <#loadUnidecodeTable>`_ to initialize
## this module.

import std/unicode

when not defined(noUnidecodeTable):
  import std/strutils

  # One table entry per line of the data file, embedded at compile time.
  const translationTable = splitLines(slurp"unidecode/unidecode.dat")
else:
  # shared is fine for threading:
  var translationTable: seq[string]

proc loadUnidecodeTable*(datafile = "unidecode.dat") =
  ## Loads the datafile that `unidecode <#unidecode,string>`_ needs to work.
  ## This is only required if the module was compiled with the
  ## `--define:noUnidecodeTable` switch. This needs to be called by the
  ## main thread before any thread can make a call to `unidecode`.
  ##
  ## Every line of `datafile` becomes one table entry, in file order. When
  ## the module was compiled without `--define:noUnidecodeTable` this proc
  ## does nothing.
  when defined(noUnidecodeTable):
    # 0xffff is only a capacity hint: the table grows with the file, so a
    # data file with more lines than that cannot index out of bounds.
    translationTable = newSeqOfCap[string](0xffff)
    for line in lines(datafile):
      translationTable.add(line)

proc unidecode*(s: string): string =
  ## Finds the sequence of ASCII characters that is the closest approximation
  ## to the UTF-8 string `s`.
  ##
  ## Code points up to 127 are copied unchanged. Any other code point is
  ## replaced by its entry in the translation table; code points the table
  ## has no entry for are dropped from the result.
  runnableExamples:
    doAssert unidecode("北京") == "Bei Jing "
    doAssert unidecode("Äußerst") == "Ausserst"

  result = ""
  for r in runes(s):
    let c = int(r)
    if c <=% 127:
      add(result, chr(c))
    else:
      # The table holds no entries for ASCII: entry 0 belongs to code point
      # 128. Range-check the shifted index rather than `c` itself, so the
      # last 128 entries of the table stay reachable.
      let idx = c - 128
      if idx <% translationTable.len:
        add(result, translationTable[idx])