Merge pull request #429 from Neugeniko/master

Added detection of Unicode BOM (Byte Order Mark) ...
2015-11-15 17:27:51 -05:00
parent b37aea40a7 dde8b1f802
commit 20759d205c
1 changed files with 27 additions and 12 deletions
--- a/service/fit.py
+++ b/service/fit.py
@@ -854,19 +854,34 @@ class Fit(object):
            file = open(path, "r")
            srcString = file.read()
            codec_found = None
-            # If file had ANSI encoding, convert it to unicode using system
+            # If file had ANSI encoding, decode it to unicode using detection
-            # default codepage, or use fallbacks UTF-16, then cp1252 on any
+            # of BOM header or if there is no header try default
-            # encoding errors
+            # codepage then fallback to utf-16, cp1252
            if isinstance(srcString, str):
-                attempt_codecs = (defcodepage, "utf-16", "cp1252")
+                encoding_map = (('\xef\xbb\xbf', 'utf-8'),('\xff\xfe\0\0', 'utf-32'),('\0\0\xfe\xff', 'UTF-32BE'),('\xff\xfe', 'utf-16'),('\xfe\xff', 'UTF-16BE'))
-                for page in attempt_codecs:
+                for bom, encoding in encoding_map:
-                    try:
+                    if srcString.startswith(bom):
-                        srcString = unicode(srcString, page)
+                        codec_found = encoding
-                        codec_found = page
+                        savebom = bom
-                    except UnicodeDecodeError:
+            
-                        logger.warn("Error unicode decoding %s from page %s, trying next codec", path, page)
+                if codec_found is None:
-                    else:
+                    logger.warn("Unicode BOM not found in file %s.", path)
-                        break
+                    attempt_codecs = (defcodepage, "utf-16", "cp1252")
                    for page in attempt_codecs:
                         try:
                             logger.warn("Attempting to decode file %s using %s page.", path, page)
                             srcString = unicode(srcString, page)
                             codec_found = page
                             logger.warn("File %s decoded using %s page.", path, page)
                         except UnicodeDecodeError:
                             logger.warn("Error unicode decoding %s from page %s, trying next codec", path, page)
                         else:
                             break
                else:
                    logger.debug("Unicode BOM detected in %s, using %s page.", path, codec_found)
                    srcString = unicode(srcString[len(savebom):], codec_found)
            else:
                # nasty hack to detect other transparent utf-16 loading
                if srcString[0] == '<' and 'utf-16' in srcString[:128].lower():