summary refs log tree commit diff stats
path: root/lib/pure/unidecode/unidecode.nim
blob: 9d8843f064058414352b17879500fb7627299a6e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#
#
#            Nim's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module is based on Python's Unidecode module by Tomaz Solc,
## which in turn is based on the ``Text::Unidecode`` Perl module by
## Sean M. Burke
## (http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm ).
##
## It provides the `unidecode` proc, which transliterates Unicode to ASCII:
## it finds the sequence of ASCII characters that is the closest approximation
## to the Unicode string.
##
## For example, the closest to string "Äußerst" in ASCII is "Ausserst". Some
## information is lost in this transformation, of course, since several Unicode
## strings can be transformed in the same ASCII representation. So this is a
## strictly one-way transformation. However a human reader will probably
## still be able to guess what original string was meant from the context.
##
## This module needs the data file "unidecode.dat" to work: You can either
## ship this file with your application and initialize this module with the
## `loadUnidecodeTable` proc or you can define the ``embedUnidecodeTable``
## symbol to embed the file as a resource into your application.

import unicode

when not defined(embedUnidecodeTable):
  # Runtime variant: the table starts out empty and is filled by
  # `loadUnidecodeTable`. Shared is fine for threading.
  var translationTable: seq[string]
else:
  # Embedded variant: the data file is read at compile time and stored in
  # the binary, one table entry per line.
  import strutils

  const translationTable = splitLines(slurp"unidecode/unidecode.dat")

proc loadUnidecodeTable*(datafile = "unidecode.dat") =
  ## Loads the data file that `unidecode` needs to work.
  ##
  ## `datafile` is read line by line; the line with number *n* (counting from
  ## zero) becomes table entry *n*. ``0xffff`` entries are always allocated;
  ## a file with more lines than that extends the table.
  ##
  ## Unless this module is compiled with the ``embedUnidecodeTable`` symbol
  ## defined, this needs to be called by the main thread before any thread
  ## can make a call to `unidecode`. With the symbol defined, this proc does
  ## nothing.
  when not defined(embedUnidecodeTable):
    newSeq(translationTable, 0xffff)
    var i = 0
    for line in lines(datafile):
      if i < translationTable.len:
        translationTable[i] = line.string
      else:
        # More lines than preallocated slots: grow the table instead of
        # failing with an index error.
        add(translationTable, line.string)
      inc(i)

proc unidecode*(s: string): string =
  ## Finds the sequence of ASCII characters that is the closest approximation
  ## to the UTF-8 string `s`.
  ##
  ## Runes up to 127 are copied unchanged. Every other rune is replaced by
  ## the translation table entry at index ``rune - 128``; runes for which the
  ## table has no entry are dropped.
  ##
  ## Example:
  ##
  ## .. code-block:: nim
  ##
  ##   unidecode("\u5317\u4EB0")
  ##
  ## Results in: "Bei Jing"
  ##
  ## .. code-block:: nim
  ##
  ##   unidecode("Äußerst")
  ##
  ## Results in: "Ausserst"
  ##
  assert(not isNil(translationTable))
  result = ""
  for r in runes(s):
    let c = int(r)
    if c <=% 127:
      add(result, chr(c))
    else:
      # The table starts at code point 128, so the bounds check has to be
      # made on the shifted index; checking `c` itself would make the last
      # 128 entries unreachable.
      let idx = c - 128
      if idx <% translationTable.len:
        let repl = translationTable[idx]
        # Slots that were never filled carry no text: nothing to append.
        if repl.len > 0: add(result, repl)

when isMainModule:
  # Self-test, only compiled when this file is the main module. The data
  # file is given as a relative path.
  loadUnidecodeTable("lib/pure/unidecode/unidecode.dat")
  # doAssert, unlike assert, is still checked when assertions are disabled,
  # so the self-test cannot silently turn into a no-op.
  doAssert unidecode("Äußerst") == "Ausserst"
lass="p">)..L-3: if ident.id == considerQuotedIdent(forLoop[i]).id: var call = forLoop.sons[L-2] var tupl = call.sons[i+1-ord(c.replaceByFieldName)] if c.field.isNil: result = newNodeI(nkBracketExpr, n.info) result.add(tupl) result.add(newIntNode(nkIntLit, c.tupleIndex)) else: result = newNodeI(nkDotExpr, n.info) result.add(tupl) result.add(newSymNode(c.field, n.info)) break else: if n.kind == nkContinueStmt: localError(n.info, errGenerated, "'continue' not supported in a 'fields' loop") result = copyNode(n) newSons(result, sonsLen(n)) for i in countup(0, sonsLen(n)-1): result.sons[i] = instFieldLoopBody(c, n.sons[i], forLoop) type TFieldsCtx = object c: PContext m: TMagic proc semForObjectFields(c: TFieldsCtx, typ, forLoop, father: PNode) = case typ.kind of nkSym: var fc: TFieldInstCtx # either 'tup[i]' or 'field' is valid fc.field = typ.sym fc.replaceByFieldName = c.m == mFieldPairs openScope(c.c) inc c.c.inUnrolledContext let body = instFieldLoopBody(fc, lastSon(forLoop), forLoop) father.add(semStmt(c.c, body)) dec c.c.inUnrolledContext closeScope(c.c) of nkNilLit: discard of nkRecCase: let L = forLoop.len let call = forLoop.sons[L-2] if call.len > 2: localError(forLoop.info, errGenerated, "parallel 'fields' iterator does not work for 'case' objects") return # iterate over the selector: semForObjectFields(c, typ[0], forLoop, father) # we need to generate a case statement: var caseStmt = newNodeI(nkCaseStmt, forLoop.info) # generate selector: var access = newNodeI(nkDotExpr, forLoop.info, 2) access.sons[0] = call.sons[1] access.sons[1] = newSymNode(typ.sons[0].sym, forLoop.info) caseStmt.add(semExprWithType(c.c, access)) # copy the branches over, but replace the fields with the for loop body: for i in 1 .. 
<typ.len: var branch = copyTree(typ[i]) let L = branch.len branch.sons[L-1] = newNodeI(nkStmtList, forLoop.info) semForObjectFields(c, typ[i].lastSon, forLoop, branch[L-1]) caseStmt.add(branch) father.add(caseStmt) of nkRecList: for t in items(typ): semForObjectFields(c, t, forLoop, father) else: illFormedAstLocal(typ) proc semForFields(c: PContext, n: PNode, m: TMagic): PNode = # so that 'break' etc. work as expected, we produce # a 'while true: stmt; break' loop ... result = newNodeI(nkWhileStmt, n.info, 2) var trueSymbol = strTableGet(magicsys.systemModule.tab, getIdent"true") if trueSymbol == nil: localError(n.info, errSystemNeeds, "true") trueSymbol = newSym(skUnknown, getIdent"true", getCurrOwner(), n.info) trueSymbol.typ = getSysType(tyBool) result.sons[0] = newSymNode(trueSymbol, n.info) var stmts = newNodeI(nkStmtList, n.info) result.sons[1] = stmts var length = sonsLen(n) var call = n.sons[length-2] if length-2 != sonsLen(call)-1 + ord(m==mFieldPairs): localError(n.info, errWrongNumberOfVariables) return result var tupleTypeA = skipTypes(call.sons[1].typ, abstractVar-{tyTypeDesc}) if tupleTypeA.kind notin {tyTuple, tyObject}: localError(n.info, errGenerated, "no object or tuple type") return result for i in 1..call.len-1: var tupleTypeB = skipTypes(call.sons[i].typ, abstractVar-{tyTypeDesc}) if not sameType(tupleTypeA, tupleTypeB): typeMismatch(call.sons[i], tupleTypeA, tupleTypeB) inc(c.p.nestedLoopCounter) if tupleTypeA.kind == tyTuple: var loopBody = n.sons[length-1] for i in 0..sonsLen(tupleTypeA)-1: openScope(c) var fc: TFieldInstCtx fc.tupleType = tupleTypeA fc.tupleIndex = i fc.replaceByFieldName = m == mFieldPairs var body = instFieldLoopBody(fc, loopBody, n) inc c.inUnrolledContext stmts.add(semStmt(c, body)) dec c.inUnrolledContext closeScope(c) else: var fc: TFieldsCtx fc.m = m fc.c = c var t = tupleTypeA while t.kind == tyObject: semForObjectFields(fc, t.n, n, stmts) if t.sons[0] == nil: break t = skipTypes(t.sons[0], skipPtrs) 
dec(c.p.nestedLoopCounter) # for TR macros this 'while true: ...; break' loop is pretty bad, so # we avoid it now if we can: if containsNode(stmts, {nkBreakStmt}): var b = newNodeI(nkBreakStmt, n.info) b.add(ast.emptyNode) stmts.add(b) else: result = stmts