about summary refs log tree commit diff stats
path: root/ranger/ext/utfwidth.py
blob: 364db7573c984c29a56ca8080a1421dd453ff96d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- encoding: utf8 -*-
# Copyright (C) 2009, 2010  Roman Zimbelmann <romanz@lavabit.com>
# Copyright (C) 2004, 2005  Timo Hirvonen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# ----
# This file contains portions of code from cmus (uchar.c).

# Python 2 provides sys.maxint; where that name cannot be imported, fall
# back to sys.maxsize.  The value serves as the "no limit" default for
# the count/end parameters in this module.
try:
	from sys import maxint
except ImportError:
	# Catch only the missing-name case so unrelated errors still surface.
	from sys import maxsize as maxint

# Character widths returned by utf_char_width()
NARROW = 1
WIDE = 2

def uwid(string, count=maxint):
	"""Return the width of a string

	The string is walked one character at a time: utf_byte_length() tells
	how many bytes the next character occupies and utf_char_width() gives
	the width of those bytes.  The widths are summed up.

	count limits how many characters are measured; by default it is large
	enough to cover the whole string.  A count of 0 yields a width of 0.
	"""
	end = len(string)
	i = 0
	width = 0
	while i < end and count:
		# utf_byte_length() never reports more than 4 bytes, so hand it a
		# short window instead of copying the whole rest of the string on
		# every iteration (which made this loop quadratic).
		bytelen = utf_byte_length(string[i:i+4])
		width += utf_char_width(string[i:i+bytelen])
		i += bytelen
		count -= 1
	return width

def uchars(string):
	"""Return a list with one string for each character

	The string is split at the character boundaries reported by
	utf_byte_length(), so every list entry holds the bytes of exactly one
	character.  A sequence that is cut off at the end of the string ends
	up as a shorter final entry.  An empty string gives an empty list.
	"""
	end = len(string)
	i = 0
	result = []
	while i < end:
		# utf_byte_length() never reports more than 4 bytes, so hand it a
		# short window instead of copying the whole rest of the string on
		# every iteration (which made this loop quadratic).
		bytelen = utf_byte_length(string[i:i+4])
		result.append(string[i:i+bytelen])
		i += bytelen
	return result

def uwidslice(string, start=0, end=maxint):
	"""Return a slice of a string, indexed by width instead of characters

	Every character of the string is laid out into a list of slots: a
	NARROW character fills one slot, a WIDE character fills two, namely an
	empty placeholder followed by the character itself.  start and end
	are ordinary slice bounds into that list of slots, and the selected
	slots are joined back into one string.

	Because the placeholder comes first, a WIDE character is part of the
	result only if its second slot lies inside the slice.
	"""
	slots = []
	for char in uchars(string):
		width = utf_char_width(char)
		if width == WIDE:
			# placeholder for the first half, then the character itself
			slots.extend(("", char))
		elif width == NARROW:
			slots.append(char)
	selected = slots[start:end]
	return "".join(selected)

def utf_byte_length(string):
	"""Return the byte length of one utf character

	Only the first byte of string is looked at; it is the lead byte of a
	UTF-8 sequence and its high bits encode the sequence length:

	    0xxxxxxx  (0x00-0x7F)  1 byte
	    10xxxxxx  (0x80-0xBF)  continuation byte without a lead -> invalid
	    110xxxxx  (0xC0-0xDF)  2 bytes
	    1110xxxx  (0xE0-0xEF)  3 bytes
	    11110xxx  (0xF0-0xF4)  4 bytes
	    anything above 0xF4    invalid

	Invalid bytes are reported with a length of 1 so that callers simply
	step over them.  Raises IndexError if string is empty.
	"""
	firstord = ord(string[0])
	# Each comparison uses the first value of the *next* class as an
	# exclusive bound, so the last lead byte of every class (0xDF, 0xEF,
	# 0xF4) is still counted with its own class.
	if firstord < 0b10000000:
		return 1
	if firstord < 0b11000000:
		return 1  # invalid
	if firstord < 0b11100000:
		return 2
	if firstord < 0b11110000:
		return 3
	if firstord <= 0b11110100:
		return 4
	return 1  # invalid

def utf_char_width(string):
	"""Return the width of a single character

	string holds one character; it is turned into an integer by
	_utf_char_to_int() and that number is looked up in a table of ranges.
	The result is WIDE if the number falls into one of the ranges and
	NARROW for everything else, including values outside all ranges.
	"""
	codepoint = _utf_char_to_int(string)

	# Single narrow values that sit inside the big "CJK ... Yi" wide range
	if codepoint in (0x303F, 0x3099, 0x309A):
		return NARROW

	# Inclusive (first, last) pairs of everything that is wide
	wide_ranges = (
		(0x1100, 0x115F),    # Hangul Jamo init. consonants
		(0x2329, 0x232A),    # Angle Brackets
		(0x2E80, 0x3029),    # CJK ... Yi (up to the narrow 0x302A-0x302F gap)
		(0x3030, 0xA4CF),    # CJK ... Yi (after the gap)
		(0xAC00, 0xD7A3),    # Hangul Syllables
		(0xF900, 0xFAFF),    # CJK Compatibility Ideographs
		(0xFE30, 0xFE6F),    # CJK Compatibility Forms
		(0xFF00, 0xFF60),    # Fullwidth Forms
		(0xFFE0, 0xFFE6),    # Fullwidth Forms
		(0x20000, 0x2FFFD),  # CJK Extra Stuff
		(0x30000, 0x3FFFD),  # ?
	)
	for first, last in wide_ranges:
		if first <= codepoint <= last:
			return WIDE
	return NARROW  # everything else, invalid values included

def _utf_char_to_int(string):
	# Squash the last 6 bits of each byte together to an integer
	u = 0
	for c in string:
		u = (u << 6) | (ord(c) & 0b00111111)
	return u