summary refs log tree commit diff stats
path: root/lib/pure/unidecode/unidecode.nim
diff options
context:
space:
mode:
Diffstat (limited to 'lib/pure/unidecode/unidecode.nim')
-rw-r--r--[-rwxr-xr-x]lib/pure/unidecode/unidecode.nim91
1 files changed, 45 insertions, 46 deletions
diff --git a/lib/pure/unidecode/unidecode.nim b/lib/pure/unidecode/unidecode.nim
index 52f9b6b1a..9affc53f6 100755..100644
--- a/lib/pure/unidecode/unidecode.nim
+++ b/lib/pure/unidecode/unidecode.nim
@@ -1,65 +1,64 @@
 #
 #
-#            Nimrod's Runtime Library
-#        (c) Copyright 2010 Andreas Rumpf
+#            Nim's Runtime Library
+#        (c) Copyright 2012 Andreas Rumpf
 #
 #    See the file "copying.txt", included in this
 #    distribution, for details about the copyright.
 #
 
-## This module is based on Python's Unidecode module by Tomaz Solc, 
-## which in turn is based on the ``Text::Unidecode`` Perl module by 
-## Sean M. Burke 
-## (http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm ).
+## This module is based on Python's [Unidecode](https://pypi.org/project/Unidecode/)
+## module by Tomaz Solc, which in turn is based on the
+## [Text::Unidecode](https://metacpan.org/pod/Text::Unidecode)
+## Perl module by Sean M. Burke.
 ##
-## It provides a single proc that does Unicode to ASCII transliterations:
-## It finds the sequence of ASCII characters that is the closest approximation
-## to the Unicode string.
+## It provides a `unidecode proc <#unidecode,string>`_ that does
+## Unicode to ASCII transliterations: It finds the sequence of ASCII characters
+## that is the closest approximation to the Unicode string.
 ##
-## For example, the closest to string "Äußerst" in ASCII is "Ausserst". Some 
-## information is lost in this transformation, of course, since several Unicode 
-## strings can be transformed in the same ASCII representation. So this is a
-## strictly one-way transformation. However a human reader will probably 
-## still be able to guess what original string was meant from the context.
+## For example, the closest to string "Äußerst" in ASCII is "Ausserst". Some
+## information is lost in this transformation, of course, since several Unicode
+## strings can be transformed to the same ASCII representation. So this is a
+## strictly one-way transformation. However, a human reader will probably
+## still be able to guess from the context, what the original string was.
 ##
-## This module needs the data file "unidecode.dat" to work, so it has to be
-## shipped with the application! 
+## This module needs the data file `unidecode.dat` to work: This file is
+## embedded as a resource into your application by default. You can also
+## define the symbol `--define:noUnidecodeTable` during compile time and
+## use the `loadUnidecodeTable proc <#loadUnidecodeTable>`_ to initialize
+## this module.
 
-import unicode
+import std/unicode
 
-proc loadTranslationTable(filename: string): seq[string] =  
-  newSeq(result, 0xffff)
-  var i = 0
-  for line in lines(filename):
-    result[i] = line
-    inc(i)
+when not defined(noUnidecodeTable):
+  import std/strutils
 
-var 
-  translationTable: seq[string]
-  
-var
-  datafile* = "unidecode.dat"   ## location can be overwritten for deployment
+  const translationTable = splitLines(slurp"unidecode/unidecode.dat")
+else:
+  # shared is fine for threading:
+  var translationTable: seq[string]
 
-proc unidecode*(s: string): string = 
+proc loadUnidecodeTable*(datafile = "unidecode.dat") =
+  ## Loads the datafile that `unidecode <#unidecode,string>`_ needs to work.
+  ## This is only required if the module was compiled with the
+  ## `--define:noUnidecodeTable` switch. This needs to be called by the
+  ## main thread before any thread can make a call to `unidecode`.
+  when defined(noUnidecodeTable):
+    newSeq(translationTable, 0xffff)
+    var i = 0
+    for line in lines(datafile):
+      translationTable[i] = line
+      inc(i)
+
+proc unidecode*(s: string): string =
   ## Finds the sequence of ASCII characters that is the closest approximation
   ## to the UTF-8 string `s`.
-  ##
-  ## Example: 
-  ## 
-  ## ..code-block:: nimrod
-  ##   unidecode("\\x53\\x17\\x4E\\xB0")
-  ##
-  ## Results in: "Bei Jing"
-  ##
+  runnableExamples:
+    doAssert unidecode("北京") == "Bei Jing "
+    doAssert unidecode("Äußerst") == "Ausserst"
+
   result = ""
-  for r in runes(s): 
+  for r in runes(s):
     var c = int(r)
     if c <=% 127: add(result, chr(c))
-    elif c <=% 0xffff: 
-      if isNil(translationTable):
-        translationTable = loadTranslationTable(datafile)
-      add(result, translationTable[c-128])
-
-when isMainModule: 
-  echo unidecode("Äußerst")
-
+    elif c <% translationTable.len: add(result, translationTable[c - 128])