Merge pull request #429 from Neugeniko/master

Added detection of Unicode BOM (Byte Order Mark) ...
This commit is contained in:
blitzmann
2015-11-15 17:27:51 -05:00

View File

@@ -854,19 +854,34 @@ class Fit(object):
file = open(path, "r") file = open(path, "r")
srcString = file.read() srcString = file.read()
codec_found = None codec_found = None
# If file had ANSI encoding, convert it to unicode using system # If file had ANSI encoding, decode it to unicode using detection
# default codepage, or use fallbacks UTF-16, then cp1252 on any # of BOM header or if there is no header try default
# encoding errors # codepage then fallback to utf-16, cp1252
if isinstance(srcString, str): if isinstance(srcString, str):
attempt_codecs = (defcodepage, "utf-16", "cp1252") encoding_map = (('\xef\xbb\xbf', 'utf-8'),('\xff\xfe\0\0', 'utf-32'),('\0\0\xfe\xff', 'UTF-32BE'),('\xff\xfe', 'utf-16'),('\xfe\xff', 'UTF-16BE'))
for page in attempt_codecs: for bom, encoding in encoding_map:
try: if srcString.startswith(bom):
srcString = unicode(srcString, page) codec_found = encoding
codec_found = page savebom = bom
except UnicodeDecodeError:
logger.warn("Error unicode decoding %s from page %s, trying next codec", path, page) if codec_found is None:
else: logger.warn("Unicode BOM not found in file %s.", path)
break attempt_codecs = (defcodepage, "utf-16", "cp1252")
for page in attempt_codecs:
try:
logger.warn("Attempting to decode file %s using %s page.", path, page)
srcString = unicode(srcString, page)
codec_found = page
logger.warn("File %s decoded using %s page.", path, page)
except UnicodeDecodeError:
logger.warn("Error unicode decoding %s from page %s, trying next codec", path, page)
else:
break
else:
logger.debug("Unicode BOM detected in %s, using %s page.", path, codec_found)
srcString = unicode(srcString[len(savebom):], codec_found)
else: else:
# nasty hack to detect other transparent utf-16 loading # nasty hack to detect other transparent utf-16 loading
if srcString[0] == '<' and 'utf-16' in srcString[:128].lower(): if srcString[0] == '<' and 'utf-16' in srcString[:128].lower():