From ccbe8b8d13ebdad09d282da51d118670a566cba5 Mon Sep 17 00:00:00 2001 From: hut Date: Tue, 4 May 2010 23:29:54 +0200 Subject: attempt to fix utf issues (wrong calculation of width) --- ranger/ext/utfwidth.py | 64 +++++++++++++++++++++++++++++++++++++++++++ ranger/gui/bar.py | 4 ++- ranger/gui/widgets/console.py | 5 ++-- test/tc_utfwidth.py | 42 ++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 3 deletions(-) create mode 100644 ranger/ext/utfwidth.py create mode 100644 test/tc_utfwidth.py diff --git a/ranger/ext/utfwidth.py b/ranger/ext/utfwidth.py new file mode 100644 index 00000000..bbc67deb --- /dev/null +++ b/ranger/ext/utfwidth.py @@ -0,0 +1,64 @@ +# -*- encoding: utf8 -*- +# Copyright (C) 2009, 2010 Roman Zimbelmann +# Copyright (C) 2004, 2005 Timo Hirvonen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# ---- +# This file contains portions of code from cmus (uchar.c). + +NARROW = 1 +WIDE = 2 + +def utf_byte_length(string): + """Return the byte length of one utf character""" + firstord = ord(string[0]) + if firstord < 0x01111111: + return 1 + if firstord < 0x10111111: + return 0 # invalid + if firstord < 0x11011111: + return min(2, len(string)) + if firstord < 0x11101111: + return min(3, len(string)) + if firstord < 0x11110100: + return min(4, len(string)) + return 0 # invalid + +def utf_char_width(string): + # XXX + u = _utf_char_to_int(string) + if u < 0x1100: + return NARROW + else: + return WIDE + +def _utf_char_to_int(string): + u = 0 + for c in string: + u = (u << 6) | (ord(c) & 0b00111111) + return u + +def uwid(string): + end = len(string) + i = 0 + width = 0 + while i < end: + bytelen = utf_byte_length(string[i:]) + if bytelen: + width += utf_char_width(string[i:i+bytelen]) + else: + width += 1 + i += bytelen + return width diff --git a/ranger/gui/bar.py b/ranger/gui/bar.py index f5e34eb1..03ed2f78 100644 --- a/ranger/gui/bar.py +++ b/ranger/gui/bar.py @@ -13,6 +13,8 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +from ranger.ext.utfwidth import uwid + class Bar(object): left = None right = None @@ -132,7 +134,7 @@ class ColoredString(object): self.string = self.string[:n] def __len__(self): - return len(self.string) + return uwid(self.string) def __str__(self): return self.string diff --git a/ranger/gui/widgets/console.py b/ranger/gui/widgets/console.py index fa9e438e..30872639 100644 --- a/ranger/gui/widgets/console.py +++ b/ranger/gui/widgets/console.py @@ -27,6 +27,7 @@ from ranger.gui.widgets.console_mode import is_valid_mode, mode_to_class from ranger import log, relpath_conf from ranger.core.runner import ALLOWED_FLAGS from ranger.ext.shell_escape import shell_quote +from ranger.ext.utfwidth import uwid from ranger.container.keymap import CommandArgs from ranger.ext.get_executables import get_executables from ranger.ext.direction import Direction @@ -105,8 +106,8 @@ class Console(Widget): def finalize(self): try: - self.fm.ui.win.move(self.y, - self.x + min(self.wid-1, self.pos + len(self.prompt))) + xpos = uwid(self.line[0:self.pos]) + len(self.prompt) + self.fm.ui.win.move(self.y, self.x + min(self.wid-1, xpos)) except: pass diff --git a/test/tc_utfwidth.py b/test/tc_utfwidth.py new file mode 100644 index 00000000..cf564990 --- /dev/null +++ b/test/tc_utfwidth.py @@ -0,0 +1,42 @@ +# -*- encoding: utf8 -*- +# Copyright (C) 2009, 2010 Roman Zimbelmann +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +if __name__ == '__main__': from __init__ import init; init() + +from unittest import TestCase, main +from ranger.ext.utfwidth import * + +a_ascii = "a" # width = 1, bytes = 1 +a_umlaut = "ä" # width = 1, bytes = 2 +a_katakana = "ア" # width = 2, bytes = 3 +# need one with width = 1 & bytes = 3 + +class Test(TestCase): + def test_utf_byte_length(self): + self.assertEqual(1, utf_byte_length(a_ascii[0])) + self.assertEqual(2, utf_byte_length(a_umlaut[0])) + self.assertEqual(3, utf_byte_length(a_katakana[0])) + + def test_uwid(self): + self.assertEqual(1, uwid(a_ascii)) + self.assertEqual(1, uwid(a_umlaut)) + self.assertEqual(2, uwid(a_katakana)) + self.assertEqual(3, uwid(a_katakana + a_umlaut)) + self.assertEqual(4, uwid("asdf")) + self.assertEqual(5, uwid("löööl")) + self.assertEqual(6, uwid("バババ")) + +if __name__ == '__main__': main() -- cgit 1.4.1-2-gfad0 From 30c8fb813f98268d93a091365f051413e4160155 Mon Sep 17 00:00:00 2001 From: hut Date: Thu, 6 May 2010 12:29:28 +0200 Subject: utf stuff --- ranger/ext/utfwidth.py | 18 +++++++++++++----- ranger/gui/widgets/console.py | 4 +++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/ranger/ext/utfwidth.py b/ranger/ext/utfwidth.py index bbc67deb..2881a2a0 100644 --- a/ranger/ext/utfwidth.py +++ b/ranger/ext/utfwidth.py @@ -27,14 +27,14 @@ def utf_byte_length(string): if firstord < 0x01111111: return 1 if firstord < 0x10111111: - return 0 # invalid + return 1 # invalid if firstord < 0x11011111: return min(2, len(string)) if firstord < 0x11101111: return min(3, len(string)) if firstord < 0x11110100: return min(4, len(string)) - return 0 # invalid + return 1 # invalid def utf_char_width(string): # XXX @@ -45,6 +45,7 @@ def utf_char_width(string): return WIDE def _utf_char_to_int(string): + # Squash the last 6 bits of each byte together to an integer u = 0 for c in string: u = (u << 6) | (ord(c) & 0b00111111) @@ -56,9 +57,16 @@ def uwid(string): width = 0 while i < end: bytelen = utf_byte_length(string[i:]) - if bytelen: - width += utf_char_width(string[i:i+bytelen]) - else: width += 1 i += bytelen return width + +def uchars(string): + end = len(string) + i = 0 + result = [] + while i < end: + bytelen = utf_byte_length(string[i:]) + result.append(string[i:i+bytelen]) + i += bytelen + return result diff --git a/ranger/gui/widgets/console.py b/ranger/gui/widgets/console.py index 30872639..5a538ce2 100644 --- a/ranger/gui/widgets/console.py +++ b/ranger/gui/widgets/console.py @@ -31,6 +31,7 @@ from ranger.ext.utfwidth import uwid from ranger.container.keymap import CommandArgs from ranger.ext.get_executables import get_executables from ranger.ext.direction import Direction +from ranger.ext.utfwidth import uwid, uchars from ranger.container import History from ranger.container.history import HistoryEmptyException import ranger @@ -264,7 +265,8 @@ class Console(Widget): self.close() pos = self.pos + mod - self.line = self.line[0:pos] + self.line[pos+1:] + chars = uchars(self.line) + self.line = ''.join(chars[0:pos] + chars[pos+1:]) self.move(right=mod) self.on_line_change() -- cgit 1.4.1-2-gfad0 From 9e435dcd5bcaf20f74f979f5ac79714172648226 Mon Sep 17 00:00:00 2001 From: hut Date: Mon, 17 May 2010 17:28:34 +0200 Subject: fixed utf stuff --- ranger/ext/utfwidth.py | 60 +++++++++++++++++++++++++++++++++++-------- ranger/gui/widgets/console.py | 9 ++++--- test/tc_utfwidth.py | 6 ++--- 3 files changed, 58 insertions(+), 17 deletions(-) diff --git a/ranger/ext/utfwidth.py b/ranger/ext/utfwidth.py index 2881a2a0..31440ef0 100644 --- a/ranger/ext/utfwidth.py +++ b/ranger/ext/utfwidth.py @@ -24,25 +24,61 @@ WIDE = 2 def utf_byte_length(string): """Return the byte length of one utf character""" firstord = ord(string[0]) - if firstord < 0x01111111: + if firstord < 0b01111111: return 1 - if firstord < 0x10111111: + if firstord < 0b10111111: return 1 # invalid - if firstord < 0x11011111: - return min(2, len(string)) - if firstord < 0x11101111: - return min(3, len(string)) - if firstord < 0x11110100: - return min(4, len(string)) + if firstord < 0b11011111: + return 2 + if firstord < 0b11101111: + return 3 + if firstord < 0b11110100: + return 4 return 1 # invalid def utf_char_width(string): - # XXX + """Return the width of a single character""" + # Inspired by cmus uchar.c u = _utf_char_to_int(string) if u < 0x1100: return NARROW - else: + # Hangul Jamo init. constonants + if u <= 0x115F: return WIDE + # Angle Brackets + if u == 0x2329 or u == 0x232A: + return WIDE + if u < 0x2e80: + return NARROW + # CJK ... Yi + if u < 0x302A: + return WIDE + if u <= 0x302F: + return NARROW + if u == 0x303F or u == 0x3099 or u == 0x309a: + return NARROW + # CJK ... Yi + if u <= 0xA4CF: + return WIDE + # Hangul Syllables + if u >= 0xAC00 and u <= 0xD7A3: + return WIDE + # CJK Compatibility Ideographs + if u >= 0xF900 and u <= 0xFAFF: + return WIDE + # CJK Compatibility Forms + if u >= 0xFE30 and u <= 0xFE6F: + return WIDE + # Fullwidth Forms + if u >= 0xFF00 and u <= 0xFF60 or u >= 0xFFE0 and u <= 0xFFE6: + return WIDE + # CJK Extra Stuff + if u >= 0x20000 and u <= 0x2FFFD: + return WIDE + # ? + if u >= 0x30000 and u <= 0x3FFFD: + return WIDE + return NARROW # invalid def _utf_char_to_int(string): # Squash the last 6 bits of each byte together to an integer @@ -52,16 +88,18 @@ def _utf_char_to_int(string): return u def uwid(string): + """Return the width of a string""" end = len(string) i = 0 width = 0 while i < end: bytelen = utf_byte_length(string[i:]) - width += 1 + width += utf_char_width(string[i:i+bytelen]) i += bytelen return width def uchars(string): + """Return a list with one string for each character""" end = len(string) i = 0 result = [] diff --git a/ranger/gui/widgets/console.py b/ranger/gui/widgets/console.py index 5a538ce2..51ecf3b2 100644 --- a/ranger/gui/widgets/console.py +++ b/ranger/gui/widgets/console.py @@ -223,11 +223,14 @@ class Console(Widget): def move(self, **keywords): direction = Direction(keywords) if direction.horizontal(): - self.pos = direction.move( + uc = uchars(self.line) + upos = len(uchars(self.line[:self.pos])) + newupos = direction.move( direction=direction.right(), minimum=0, - maximum=len(self.line) + 1, - current=self.pos) + maximum=len(uc) + 1, + current=upos) + self.pos = len(''.join(uc[:newupos])) def delete_rest(self, direction): self.tab_deque = None diff --git a/test/tc_utfwidth.py b/test/tc_utfwidth.py index cf564990..d8ffbe1d 100644 --- a/test/tc_utfwidth.py +++ b/test/tc_utfwidth.py @@ -26,9 +26,9 @@ a_katakana = "ア" # width = 2, bytes = 3 class Test(TestCase): def test_utf_byte_length(self): - self.assertEqual(1, utf_byte_length(a_ascii[0])) - self.assertEqual(2, utf_byte_length(a_umlaut[0])) - self.assertEqual(3, utf_byte_length(a_katakana[0])) + self.assertEqual(1, utf_byte_length(a_ascii)) + self.assertEqual(2, utf_byte_length(a_umlaut)) + self.assertEqual(3, utf_byte_length(a_katakana)) def test_uwid(self): self.assertEqual(1, uwid(a_ascii)) -- cgit 1.4.1-2-gfad0 From 3ac9d6c841c47cebaab028557a217911d6cecd0e Mon Sep 17 00:00:00 2001 From: hut Date: Mon, 17 May 2010 17:36:16 +0200 Subject: utf: fixed console.delete() --- ranger/gui/widgets/console.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ranger/gui/widgets/console.py b/ranger/gui/widgets/console.py index 51ecf3b2..48116187 100644 --- a/ranger/gui/widgets/console.py +++ b/ranger/gui/widgets/console.py @@ -264,13 +264,15 @@ class Console(Widget): def delete(self, mod): self.tab_deque = None - if mod == -1 and len(self.line) == 0: - self.close() - pos = self.pos + mod - - chars = uchars(self.line) - self.line = ''.join(chars[0:pos] + chars[pos+1:]) - self.move(right=mod) + if mod == -1 and self.pos == 0: + if not self.line: + self.close() + return + uc = uchars(self.line) + upos = len(uchars(self.line[:self.pos])) + mod + left_part = ''.join(uc[:upos]) + self.pos = len(left_part) + self.line = left_part + ''.join(uc[upos+1:]) self.on_line_change() def execute(self): -- cgit 1.4.1-2-gfad0 From 637fc4a1011b0f8b66ee213de1a1c3343d1bad0e Mon Sep 17 00:00:00 2001 From: hut Date: Mon, 17 May 2010 17:49:19 +0200 Subject: utf: reordering --- ranger/ext/utfwidth.py | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/ranger/ext/utfwidth.py b/ranger/ext/utfwidth.py index 31440ef0..a506c676 100644 --- a/ranger/ext/utfwidth.py +++ b/ranger/ext/utfwidth.py @@ -21,6 +21,28 @@ NARROW = 1 WIDE = 2 +def uwid(string): + """Return the width of a string""" + end = len(string) + i = 0 + width = 0 + while i < end: + bytelen = utf_byte_length(string[i:]) + width += utf_char_width(string[i:i+bytelen]) + i += bytelen + return width + +def uchars(string): + """Return a list with one string for each character""" + end = len(string) + i = 0 + result = [] + while i < end: + bytelen = utf_byte_length(string[i:]) + result.append(string[i:i+bytelen]) + i += bytelen + return result + def utf_byte_length(string): """Return the byte length of one utf character""" firstord = ord(string[0]) @@ -38,7 +60,6 @@ def utf_byte_length(string): def utf_char_width(string): """Return the width of a single character""" - # Inspired by cmus uchar.c u = _utf_char_to_int(string) if u < 0x1100: return NARROW @@ -78,7 +99,7 @@ def utf_char_width(string): # ? if u >= 0x30000 and u <= 0x3FFFD: return WIDE - return NARROW # invalid + return NARROW # invalid (?) def _utf_char_to_int(string): # Squash the last 6 bits of each byte together to an integer @@ -86,25 +107,3 @@ def _utf_char_to_int(string): for c in string: u = (u << 6) | (ord(c) & 0b00111111) return u - -def uwid(string): - """Return the width of a string""" - end = len(string) - i = 0 - width = 0 - while i < end: - bytelen = utf_byte_length(string[i:]) - width += utf_char_width(string[i:i+bytelen]) - i += bytelen - return width - -def uchars(string): - """Return a list with one string for each character""" - end = len(string) - i = 0 - result = [] - while i < end: - bytelen = utf_byte_length(string[i:]) - result.append(string[i:i+bytelen]) - i += bytelen - return result -- cgit 1.4.1-2-gfad0 From d0e56912638b2a90bb960574ceb5c0bbbf415aca Mon Sep 17 00:00:00 2001 From: hut Date: Mon, 17 May 2010 18:52:51 +0200 Subject: utf: partial fix of console width overflow handling --- ranger/gui/widgets/console.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ranger/gui/widgets/console.py b/ranger/gui/widgets/console.py index 48116187..3bb41482 100644 --- a/ranger/gui/widgets/console.py +++ b/ranger/gui/widgets/console.py @@ -99,8 +99,9 @@ class Console(Widget): self.win.erase() self.addstr(0, 0, self.prompt) - overflow = -self.wid + len(self.prompt) + len(self.line) + 1 + overflow = -self.wid + len(self.prompt) + uwid(self.line) + 1 if overflow > 0: + #XXX: cut uft-char-wise, consider width self.addstr(self.line[overflow:]) else: self.addstr(self.line) @@ -223,6 +224,7 @@ class Console(Widget): def move(self, **keywords): direction = Direction(keywords) if direction.horizontal(): + # Ensure that the pointer is moved utf-char-wise uc = uchars(self.line) upos = len(uchars(self.line[:self.pos])) newupos = direction.move( @@ -268,6 +270,7 @@ class Console(Widget): if not self.line: self.close() return + # Delete utf-char-wise uc = uchars(self.line) upos = len(uchars(self.line[:self.pos])) + mod left_part = ''.join(uc[:upos]) -- cgit 1.4.1-2-gfad0 From d213c6b6eb252df03e4fbbe97e7e71876da8c309 Mon Sep 17 00:00:00 2001 From: hut Date: Mon, 17 May 2010 20:09:42 +0200 Subject: main: removed unnecessary complex locale handling --- ranger/__main__.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/ranger/__main__.py b/ranger/__main__.py index 887f8e28..f941299e 100644 --- a/ranger/__main__.py +++ b/ranger/__main__.py @@ -19,6 +19,7 @@ import os import sys import ranger +import locale from optparse import OptionParser, SUPPRESS_HELP from ranger.ext.openstruct import OpenStruct @@ -153,14 +154,7 @@ def main(): print('ranger requires the python curses module. Aborting.') sys.exit(1) - # Ensure that a utf8 locale is set. - if getdefaultlocale()[1] not in ('utf8', 'UTF-8'): - for locale in ('en_US.utf8', 'en_US.UTF-8'): - try: setlocale(LC_ALL, locale) - except: pass - else: break - else: setlocale(LC_ALL, '') - else: setlocale(LC_ALL, '') + locale.setlocale(locale.LC_ALL, '') arg = parse_arguments() ranger.arg = arg -- cgit 1.4.1-2-gfad0