diff options
author | toonn <toonn@toonn.io> | 2018-10-28 13:46:29 +0100 |
---|---|---|
committer | toonn <toonn@toonn.io> | 2018-10-28 13:53:26 +0100 |
commit | 55099758427b04201510774b4526bd2b0d331b48 (patch) | |
tree | 5359cbf6041d8ba79169fafeeef231ee3e3eb750 /ranger | |
parent | 9b1af8452642267ae982853ba331bb8fd8f22320 (diff) | |
download | ranger-55099758427b04201510774b4526bd2b0d331b48.tar.gz |
Refactor encoding detection
Diffstat (limited to 'ranger')
-rw-r--r-- | ranger/core/actions.py | 41 |
1 files changed, 22 insertions, 19 deletions
```diff
diff --git a/ranger/core/actions.py b/ranger/core/actions.py
index 7e5765b0..5358a40f 100644
--- a/ranger/core/actions.py
+++ b/ranger/core/actions.py
@@ -1167,33 +1167,36 @@ class Actions( # pylint: disable=too-many-instance-attributes,too-many-public-m
     @staticmethod
     def read_text_file(path, count=None):
         """Encoding-aware reading of a text file."""
+        # Guess encoding ourselves.
+        # These should be the most frequently used ones.
+        # latin-1 as the last resort
+        encodings = [ ('utf-8', 'strict')
+                    , ('utf-16', 'strict')
+                    , ('latin-1', 'replace')
+                    ]
+
+        with open(path, 'rb') as fobj:
+            data = fobj.read(count)
+
         try:
             import chardet
         except ImportError:
-            # Guess encoding ourselves.
-            # These should be the most frequently used ones.
-            encodings = ('utf-8', 'utf-16')
-            for encoding in encodings:
-                try:
-                    with codecs.open(path, 'r', encoding=encoding) as fobj:
-                        text = fobj.read(count)
-                except UnicodeDecodeError:
-                    pass
-                else:
-                    LOG.debug("guessed encoding of '%s' as %r", path, encoding)
-                    return text
+            pass
         else:
-            with open(path, 'rb') as fobj:
-                data = fobj.read(count)
             result = chardet.detect(data)
             guessed_encoding = result['encoding']
             if guessed_encoding is not None:
-                LOG.debug("chardet guess for '%s': %s", path, result)
-                return codecs.decode(data, guessed_encoding, 'replace')
+                # Add chardet's guess before our own.
+                encodings.insert(0, (guessed_encoding, 'replace'))
 
-        # latin-1 as the last resort
-        with codecs.open(path, 'r', encoding='latin-1', errors='replace') as fobj:
-            return fobj.read(count)
+        for (encoding, error_scheme) in encodings:
+            try:
+                text = codecs.decode(data, encoding, error_scheme)
+            except UnicodeDecodeError:
+                pass
+            else:
+                LOG.debug("Guessed encoding of '%s' as %s", path, encoding)
+                return text
 
     # --------------------------
     # -- Tabs
```