Add optional encoding detection with chardet

If chardet is not installed, try utf-8 and then utf-16. If both fail, fall back to latin-1 as before. Fixes #990.
author: FichteFoll <fichtefoll2@googlemail.com> 2018-02-24 20:00:36 +0100
committer: FichteFoll <fichtefoll2@googlemail.com> 2018-03-09 02:10:30 +0100
commit: 08b08d70f8b2f73d5f37c749774b9f16f0c82dbe (patch)
tree: 8eb8229ac059a17bd089061dbd4d52145d3db351
parent: f855979587bd918f0d32c0caba79ab4b4aa531cf (diff)
download: ranger-08b08d70f8b2f73d5f37c749774b9f16f0c82dbe.tar.gz
2 files changed, 31 insertions, 8 deletions
diff --git a/README.md b/README.md
index ef644ae6..de0514d3 100644
--- a/README.md
+++ b/README.md
@@ -79,6 +79,7 @@ Optional, for enhanced file previews (with `scope.sh`):
 * `transmission-show` for viewing bit-torrent information
 * `mediainfo` or `exiftool` for viewing information about media files
 * `odt2txt` for OpenDocument text files (`odt`, `ods`, `odp` and `sxw`)
+* `chardet` (Python package) for improved encoding detection of text files
 
 
 Installing
diff --git a/ranger/core/actions.py b/ranger/core/actions.py
index 6bbb35aa..306a6166 100644
--- a/ranger/core/actions.py
+++ b/ranger/core/actions.py
@@ -1063,14 +1063,7 @@ class Actions(  # pylint: disable=too-many-instance-attributes,too-many-public-m
                 data[(-1, -1)] = None
                 data['foundpreview'] = False
             elif rcode == 2:
-                fobj = codecs.open(path, 'r', errors='ignore')
-                try:
-                    data[(-1, -1)] = fobj.read(1024 * 32)
-                except UnicodeDecodeError:
-                    fobj.close()
-                    fobj = codecs.open(path, 'r', encoding='latin-1', errors='ignore')
-                    data[(-1, -1)] = fobj.read(1024 * 32)
-                fobj.close()
+                data[(-1, -1)] = self.read_text_file(path, 1024 * 32)
             else:
                 data[(-1, -1)] = None
 
@@ -1111,6 +1104,35 @@ class Actions(  # pylint: disable=too-many-instance-attributes,too-many-public-m
 
         return None
 
+    @staticmethod
+    def read_text_file(path, count=None):
+        """Encoding-aware reading of a text file."""
+        try:
+            import chardet
+        except ImportError:
+            # Guess encoding ourselves. These should be the most frequently used ones.
+            encodings = ('utf-8', 'utf-16')
+            for encoding in encodings:
+                try:
+                    with codecs.open(path, 'r', encoding=encoding) as fobj:
+                        text = fobj.read(count)
+                except UnicodeDecodeError:
+                    pass
+                else:
+                    LOG.debug("guessed encoding of '%s' as %r", path, encoding)
+                    return text
+        else:
+            with open(path, 'rb') as fobj:
+                data = fobj.read(count)
+            result = chardet.detect(data)
+            LOG.debug("chardet guess for '%s': %s", path, result)
+            guessed_encoding = result['encoding']
+            return codecs.decode(data, guessed_encoding, 'replace')
+
+        # latin-1 as the last resort
+        with codecs.open(path, 'r', encoding='latin-1', errors='replace') as fobj:
+            return fobj.read(count)
+
     # --------------------------
     # -- Tabs
     # --------------------------
author	FichteFoll <fichtefoll2@googlemail.com>	2018-02-24 20:00:36 +0100
committer	FichteFoll <fichtefoll2@googlemail.com>	2018-03-09 02:10:30 +0100
commit	08b08d70f8b2f73d5f37c749774b9f16f0c82dbe (patch)
tree	8eb8229ac059a17bd089061dbd4d52145d3db351
parent	f855979587bd918f0d32c0caba79ab4b4aa531cf (diff)
download	ranger-08b08d70f8b2f73d5f37c749774b9f16f0c82dbe.tar.gz