diff options
Diffstat (limited to 'lib/pure/unidecode/unidecode.nim')
-rw-r--r--[-rwxr-xr-x] | lib/pure/unidecode/unidecode.nim | 91 |
1 files changed, 45 insertions, 46 deletions
diff --git a/lib/pure/unidecode/unidecode.nim b/lib/pure/unidecode/unidecode.nim
index 52f9b6b1a..9affc53f6 100755..100644
--- a/lib/pure/unidecode/unidecode.nim
+++ b/lib/pure/unidecode/unidecode.nim
@@ -1,65 +1,64 @@
 #
 #
-#            Nimrod's Runtime Library
-#        (c) Copyright 2010 Andreas Rumpf
+#            Nim's Runtime Library
+#        (c) Copyright 2012 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
 #
 
-## This module is based on Python's Unidecode module by Tomaz Solc,
-## which in turn is based on the ``Text::Unidecode`` Perl module by
-## Sean M. Burke
-## (http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm ).
+## This module is based on Python's [Unidecode](https://pypi.org/project/Unidecode/)
+## module by Tomaz Solc, which in turn is based on the
+## [Text::Unidecode](https://metacpan.org/pod/Text::Unidecode)
+## Perl module by Sean M. Burke.
 ##
-## It provides a single proc that does Unicode to ASCII transliterations:
-## It finds the sequence of ASCII characters that is the closest approximation
-## to the Unicode string.
+## It provides a `unidecode proc <#unidecode,string>`_ that does
+## Unicode to ASCII transliterations: It finds the sequence of ASCII characters
+## that is the closest approximation to the Unicode string.
 ##
-## For example, the closest to string "Äußerst" in ASCII is "Ausserst". Some
-## information is lost in this transformation, of course, since several Unicode
-## strings can be transformed in the same ASCII representation. So this is a
-## strictly one-way transformation. However a human reader will probably
-## still be able to guess what original string was meant from the context.
+## For example, the closest to string "Äußerst" in ASCII is "Ausserst". Some
+## information is lost in this transformation, of course, since several Unicode
+## strings can be transformed to the same ASCII representation. So this is a
+## strictly one-way transformation. However, a human reader will probably
+## still be able to guess from the context, what the original string was.
 ##
-## This module needs the data file "unidecode.dat" to work, so it has to be
-## shipped with the application!
+## This module needs the data file `unidecode.dat` to work: This file is
+## embedded as a resource into your application by default. You can also
+## define the symbol `--define:noUnidecodeTable` during compile time and
+## use the `loadUnidecodeTable proc <#loadUnidecodeTable>`_ to initialize
+## this module.
 
-import unicode
+import std/unicode
 
-proc loadTranslationTable(filename: string): seq[string] =
-  newSeq(result, 0xffff)
-  var i = 0
-  for line in lines(filename):
-    result[i] = line
-    inc(i)
+when not defined(noUnidecodeTable):
+  import std/strutils
 
-var
-  translationTable: seq[string]
-
-var
-  datafile* = "unidecode.dat" ## location can be overwritten for deployment
+  const translationTable = splitLines(slurp"unidecode/unidecode.dat")
+else:
+  # shared is fine for threading:
+  var translationTable: seq[string]
 
-proc unidecode*(s: string): string =
+proc loadUnidecodeTable*(datafile = "unidecode.dat") =
+  ## Loads the datafile that `unidecode <#unidecode,string>`_ needs to work.
+  ## This is only required if the module was compiled with the
+  ## `--define:noUnidecodeTable` switch. This needs to be called by the
+  ## main thread before any thread can make a call to `unidecode`.
+  when defined(noUnidecodeTable):
+    newSeq(translationTable, 0xffff)
+    var i = 0
+    for line in lines(datafile):
+      translationTable[i] = line
+      inc(i)
+
+proc unidecode*(s: string): string =
   ## Finds the sequence of ASCII characters that is the closest approximation
   ## to the UTF-8 string `s`.
-  ##
-  ## Example:
-  ##
-  ## ..code-block:: nimrod
-  ##   unidecode("\\x53\\x17\\x4E\\xB0")
-  ##
-  ## Results in: "Bei Jing"
-  ##
+  runnableExamples:
+    doAssert unidecode("北京") == "Bei Jing "
+    doAssert unidecode("Äußerst") == "Ausserst"
+
   result = ""
-  for r in runes(s):
+  for r in runes(s):
     var c = int(r)
     if c <=% 127: add(result, chr(c))
-    elif c <=% 0xffff:
-      if isNil(translationTable):
-        translationTable = loadTranslationTable(datafile)
-      add(result, translationTable[c-128])
-
-when isMainModule:
-  echo unidecode("Äußerst")
-
+    elif c - 128 <% translationTable.len: add(result, translationTable[c - 128])