fixed utf stuff

author: hut <hut@lavabit.com> 2010-05-17 17:28:34 +0200
committer: hut <hut@lavabit.com> 2010-05-17 17:29:16 +0200
commit: 9e435dcd5bcaf20f74f979f5ac79714172648226 (patch)
tree: 9433856d57c90e4495b828a0d3b8a27baf13caf6 /ranger/ext/utfwidth.py
parent: 30c8fb813f98268d93a091365f051413e4160155 (diff)
download: ranger-9e435dcd5bcaf20f74f979f5ac79714172648226.tar.gz
1 files changed, 49 insertions, 11 deletions
diff --git a/ranger/ext/utfwidth.py b/ranger/ext/utfwidth.py
index 2881a2a0..31440ef0 100644
--- a/ranger/ext/utfwidth.py
+++ b/ranger/ext/utfwidth.py
@@ -24,25 +24,61 @@ WIDE = 2
 def utf_byte_length(string):
 	"""Return the byte length of one utf character"""
 	firstord = ord(string[0])
-	if firstord < 0x01111111:
+	if firstord < 0b01111111:
 		return 1
-	if firstord < 0x10111111:
+	if firstord < 0b10111111:
 		return 1  # invalid
-	if firstord < 0x11011111:
-		return min(2, len(string))
-	if firstord < 0x11101111:
-		return min(3, len(string))
-	if firstord < 0x11110100:
-		return min(4, len(string))
+	if firstord < 0b11011111:
+		return 2
+	if firstord < 0b11101111:
+		return 3
+	if firstord < 0b11110100:
+		return 4
 	return 1  # invalid
 
 def utf_char_width(string):
-	# XXX
+	"""Return the width of a single character"""
+	# Inspired by cmus uchar.c
 	u = _utf_char_to_int(string)
 	if u < 0x1100:
 		return NARROW
-	else:
+	# Hangul Jamo initial consonants
+	if u <= 0x115F:
 		return WIDE
+	# Angle Brackets
+	if u == 0x2329 or u == 0x232A:
+		return WIDE
+	if u < 0x2e80:
+		return NARROW
+	# CJK ... Yi
+	if u < 0x302A:
+		return WIDE
+	if u <= 0x302F:
+		return NARROW
+	if u == 0x303F or u == 0x3099 or u == 0x309a:
+		return NARROW
+	# CJK ... Yi
+	if u <= 0xA4CF:
+		return WIDE
+	# Hangul Syllables
+	if u >= 0xAC00 and u <= 0xD7A3:
+		return WIDE
+	# CJK Compatibility Ideographs
+	if u >= 0xF900 and u <= 0xFAFF:
+		return WIDE
+	# CJK Compatibility Forms
+	if u >= 0xFE30 and u <= 0xFE6F:
+		return WIDE
+	# Fullwidth Forms
+	if u >= 0xFF00 and u <= 0xFF60 or u >= 0xFFE0 and u <= 0xFFE6:
+		return WIDE
+	# CJK Extra Stuff
+	if u >= 0x20000 and u <= 0x2FFFD:
+		return WIDE
+	# Plane 3 (Tertiary Ideographic Plane), treated as wide like Plane 2
+	if u >= 0x30000 and u <= 0x3FFFD:
+		return WIDE
+	return NARROW  # invalid
 
 def _utf_char_to_int(string):
 	# Squash the last 6 bits of each byte together to an integer
@@ -52,16 +88,18 @@ def _utf_char_to_int(string):
 	return u
 
 def uwid(string):
+	"""Return the width of a string"""
 	end = len(string)
 	i = 0
 	width = 0
 	while i < end:
 		bytelen = utf_byte_length(string[i:])
-			width += 1
+		width += utf_char_width(string[i:i+bytelen])
 		i += bytelen
 	return width
 
 def uchars(string):
+	"""Return a list with one string for each character"""
 	end = len(string)
 	i = 0
 	result = []
author	hut <hut@lavabit.com>	2010-05-17 17:28:34 +0200
committer	hut <hut@lavabit.com>	2010-05-17 17:29:16 +0200
commit	9e435dcd5bcaf20f74f979f5ac79714172648226 (patch)
tree	9433856d57c90e4495b828a0d3b8a27baf13caf6 /ranger/ext/utfwidth.py
parent	30c8fb813f98268d93a091365f051413e4160155 (diff)
download	ranger-9e435dcd5bcaf20f74f979f5ac79714172648226.tar.gz