blob: 364db7573c984c29a56ca8080a1421dd453ff96d (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
# -*- encoding: utf8 -*-
# Copyright (C) 2009, 2010 Roman Zimbelmann <romanz@lavabit.com>
# Copyright (C) 2004, 2005 Timo Hirvonen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# ----
# This file contains portions of code from cmus (uchar.c).
# Python 2 exposes sys.maxint; where that import is unavailable, fall back
# to sys.maxsize under the same name. Only ImportError is caught so that
# unrelated failures (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    from sys import maxint
except ImportError:
    from sys import maxsize as maxint

# Column widths reported by utf_char_width().
NARROW = 1
WIDE = 2
def uwid(string, count=maxint):
    """Return the summed width of the leading characters of a string.

    The string is walked one UTF-8 encoded character at a time; each
    character contributes the value utf_char_width() reports for it.

    string -- the encoded string to measure
    count  -- maximum number of characters to measure (default: maxint).
              The loop stops when the counter reaches zero, so a negative
              value never stops it early and the whole string is measured.
    """
    end = len(string)
    i = 0
    width = 0
    while i < end and count:
        # utf_byte_length() only looks at the first element, so pass a
        # one-element slice instead of copying the whole remaining tail
        # on every iteration.
        bytelen = utf_byte_length(string[i:i + 1])
        width += utf_char_width(string[i:i + bytelen])
        i += bytelen
        count -= 1
    return width
def uchars(string):
    """Return a list with one string for each character.

    The input is split at the boundaries reported by utf_byte_length(),
    so every list element holds the units of exactly one encoded
    character. An empty input yields an empty list.
    """
    end = len(string)
    i = 0
    result = []
    while i < end:
        # utf_byte_length() only looks at the first element, so pass a
        # one-element slice instead of copying the whole remaining tail
        # on every iteration.
        bytelen = utf_byte_length(string[i:i + 1])
        result.append(string[i:i + bytelen])
        i += bytelen
    return result
def uwidslice(string, start=0, end=maxint):
    """Return a slice of a string, indexed by column instead of character.

    The string is laid out into cells: a NARROW character fills one cell,
    a WIDE character fills two (an empty placeholder followed by the
    character itself). The cells in [start:end] are then joined back
    into a string.
    """
    cells = []
    for char in uchars(string):
        char_width = utf_char_width(char)
        if char_width == WIDE:
            # Placeholder first, so the character sits in its second cell.
            cells.extend(("", char))
        elif char_width == NARROW:
            cells.append(char)
    selected = cells[start:end]
    return "".join(selected)
def utf_byte_length(string):
    """Return the byte length of one utf character.

    Only the first element of `string` (the lead byte) is inspected:

        0xxxxxxx (0x00-0x7F)  -> 1  (ASCII)
        10xxxxxx (0x80-0xBF)  -> 1  (stray continuation byte, invalid)
        110xxxxx (0xC0-0xDF)  -> 2
        1110xxxx (0xE0-0xEF)  -> 3
        11110xxx (0xF0-0xF4)  -> 4
        anything above        -> 1  (invalid)

    Invalid lead bytes report a length of 1 so that callers always make
    progress. Raises IndexError if `string` is empty.
    """
    first = string[0]
    # Indexing a bytes object on Python 3 already yields an int.
    firstord = first if isinstance(first, int) else ord(first)
    # The upper bound of every range is inclusive: 0xDF, 0xEF and 0xF4 are
    # themselves valid lead bytes of 2-, 3- and 4-byte sequences.
    if firstord <= 0b01111111:
        return 1
    if firstord <= 0b10111111:
        return 1  # invalid
    if firstord <= 0b11011111:
        return 2
    if firstord <= 0b11101111:
        return 3
    if firstord <= 0b11110100:
        return 4
    return 1  # invalid
def utf_char_width(string):
    """Return the width (NARROW or WIDE) of a single character.

    The character is converted to an integer by _utf_char_to_int() and
    classified by range. The checks below are order-dependent: each one
    relies on the lower bound established by the checks before it.
    """
    code = _utf_char_to_int(string)

    if code < 0x1100:
        return NARROW
    # Hangul Jamo initial consonants
    if code <= 0x115F:
        return WIDE
    # Angle brackets
    if code in (0x2329, 0x232A):
        return WIDE
    if code < 0x2E80:
        return NARROW
    # CJK ... Yi
    if code < 0x302A:
        return WIDE
    # Narrow exceptions inside the CJK ... Yi span
    if code <= 0x302F or code in (0x303F, 0x3099, 0x309A):
        return NARROW
    # CJK ... Yi
    if code <= 0xA4CF:
        return WIDE

    # Remaining wide ranges, both ends inclusive.
    wide_ranges = (
        (0xAC00, 0xD7A3),    # Hangul Syllables
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0xFE30, 0xFE6F),    # CJK Compatibility Forms
        (0xFF00, 0xFF60),    # Fullwidth Forms
        (0xFFE0, 0xFFE6),    # Fullwidth Forms
        (0x20000, 0x2FFFD),  # CJK Extra Stuff
        (0x30000, 0x3FFFD),  # ?
    )
    for low, high in wide_ranges:
        if low <= code <= high:
            return WIDE
    return NARROW  # invalid (?)
def _utf_char_to_int(string):
# Squash the last 6 bits of each byte together to an integer
u = 0
for c in string:
u = (u << 6) | (ord(c) & 0b00111111)
return u
|