Added detection of Unicode BOM (Byte Order Mark) to determine the encoding used in imported files.

Neugeniko
2015-11-14 01:00:32 +11:00
parent 89b756d760
commit dde8b1f802

@@ -854,19 +854,35 @@ class Fit(object):
         file = open(path, "r")
         srcString = file.read()
         codec_found = None
-        # If file had ANSI encoding, convert it to unicode using system
-        # default codepage, or use fallbacks UTF-16, then cp1252 on any
-        # encoding errors
+        # If the file had ANSI encoding, decode it to unicode by detecting
+        # a BOM header; if there is no header, try the system default
+        # codepage, then fall back to utf-16 and cp1252
         if isinstance(srcString, str):
-            attempt_codecs = (defcodepage, "utf-16", "cp1252")
-            for page in attempt_codecs:
-                try:
-                    srcString = unicode(srcString, page)
-                    codec_found = page
-                except UnicodeDecodeError:
-                    logger.warn("Error unicode decoding %s from page %s, trying next codec", path, page)
-                else:
-                    break
+            encoding_map = (('\xef\xbb\xbf', 'utf-8'),
+                            ('\xff\xfe\0\0', 'utf-32'),
+                            ('\0\0\xfe\xff', 'UTF-32BE'),
+                            ('\xff\xfe', 'utf-16'), ('\xfe\xff', 'UTF-16BE'))
+            for bom, encoding in encoding_map:
+                if srcString.startswith(bom):
+                    codec_found = encoding
+                    savebom = bom
+                    break
+            if codec_found is None:
+                logger.warn("Unicode BOM not found in file %s.", path)
+                attempt_codecs = (defcodepage, "utf-16", "cp1252")
+                for page in attempt_codecs:
+                    try:
+                        logger.warn("Attempting to decode file %s using codepage %s.", path, page)
+                        srcString = unicode(srcString, page)
+                        codec_found = page
+                        logger.warn("File %s decoded using codepage %s.", path, page)
+                    except UnicodeDecodeError:
+                        logger.warn("Error unicode decoding %s from page %s, trying next codec", path, page)
+                    else:
+                        break
+            else:
+                logger.debug("Unicode BOM detected in %s, decoding as %s.", path, codec_found)
+                srcString = unicode(srcString[len(savebom):], codec_found)
         else:
            # nasty hack to detect other transparent utf-16 loading
            if srcString[0] == '<' and 'utf-16' in srcString[:128].lower():
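
For reference, the same BOM-sniffing idea can be written as a standalone helper. The sketch below assumes Python 3 and uses the BOM constants the standard library's codecs module already provides instead of hand-written byte strings; the decode_with_bom name and its fallbacks parameter are illustrative, not part of this commit.

import codecs
import logging

logger = logging.getLogger(__name__)

# Longest BOMs must be tested first: the UTF-32LE BOM (ff fe 00 00)
# starts with the UTF-16LE BOM (ff fe), so checking UTF-16 first
# would misidentify UTF-32 files. This is also why the BOM loop in
# the diff above breaks out on the first match.
BOM_MAP = (
    (codecs.BOM_UTF32_LE, 'utf-32-le'),
    (codecs.BOM_UTF32_BE, 'utf-32-be'),
    (codecs.BOM_UTF8, 'utf-8'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
)

def decode_with_bom(raw, fallbacks=('utf-16', 'cp1252')):
    """Decode raw bytes to text, preferring a BOM; return (text, codec)."""
    for bom, codec in BOM_MAP:
        if raw.startswith(bom):
            # Strip the BOM so it does not leak into the decoded text
            return raw[len(bom):].decode(codec), codec
    # No BOM: fall back to trial decoding, as the commit does
    for codec in fallbacks:
        try:
            return raw.decode(codec), codec
        except UnicodeDecodeError:
            logger.warning("decoding as %s failed, trying next codec", codec)
    raise ValueError("no candidate codec could decode the input")

Typical use, reading the file in binary mode so the BOM bytes are visible:

with open(path, 'rb') as f:
    text, codec = decode_with_bom(f.read())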