Allow utf8 in mappings

We were previously processing entries in mapping files (when
`--mappings-are-raw` is not given) with
`.decode('unicode_escape').encode('utf8')` to replace backslash escape
sequences in bytestrings with the utf-8 encoded characters they
represent. However, it turns out that `.decode('unicode_escape')`
assumes latin-1 encoding if it encounters non-ascii
bytes: https://bugs.python.org/issue21331. So this gave incorrect
results if non-ascii utf8 data was present in the mapping.

To fix this, we now add an extra layer of
`.decode('utf8').encode('unicode-escape')` in order to convert any
non-ascii characters into
their backslash escape sequences. Then the subsequent
`.decode('unicode_escape')` only encounters ascii characters and gives
correct results.
This commit is contained in:
chrisjbillington
2020-03-25 12:31:16 -04:00
parent e51844cd65
commit 3b3f86b71e
2 changed files with 15 additions and 7 deletions

View File

@@ -426,12 +426,20 @@ def load_mapping(name, filename, mapping_is_raw):
return None
return (m.group(1).strip(), m.group(2).strip())
def process_unicode_escape_sequences(s):
    """Expand backslash escape sequences in a UTF-8 encoded bytestring.

    s is a bytestring that is UTF-8 encoded but may also contain backslash
    escape sequences (e.g. ``\\xe9``, ``\\u00e9``, ``\\t``). Returns a
    UTF-8 encoded bytestring in which each escape sequence has been replaced
    by the character it represents; characters that were already present as
    UTF-8 are passed through unchanged.

    Raises UnicodeDecodeError if s is not valid UTF-8 or contains a
    malformed escape sequence.
    """
    # .decode('unicode-escape') interprets non-ascii input bytes as latin-1,
    # so every non-ascii character is first rewritten as a backslash escape.
    # The 'backslashreplace' error handler is used for this rather than the
    # 'unicode-escape' codec: that codec would also double the backslashes
    # already present in s, and the escape sequences written by the user
    # would then survive the round trip unexpanded.
    ascii_only = s.decode('utf8').encode('ascii', 'backslashreplace')
    return ascii_only.decode('unicode-escape').encode('utf8')
def parse_quoted_line(line):
    """Parse one mapping line with quoted_regexp.

    Returns None when the line does not match quoted_regexp. Otherwise
    returns a 2-tuple built from match groups 1 and 5 (presumably the two
    sides of the mapping entry -- confirm against quoted_regexp), each passed
    through process_unicode_escape_sequences so that backslash escapes are
    expanded without mangling non-ascii UTF-8 data.
    """
    m = quoted_regexp.match(line)
    if m is None:
        return None
    # A bare .decode('unicode_escape') must not be applied to the groups
    # directly: it assumes latin-1 for non-ascii bytes and corrupts UTF-8.
    return (process_unicode_escape_sequences(m.group(1)),
            process_unicode_escape_sequences(m.group(5)))
cache={}
if not os.path.exists(filename):