mirror of
				https://github.com/frej/fast-export.git
				synced 2025-10-31 16:35:48 +01:00 
			
		
		
		
	Allow utf8 in mappings
We were previously processing entries in mapping files (when
`--mappings-are-raw` is not given) with
`.decode('unicode_escape').encode('utf8')` to replace backslash escape
sequences in bytestrings with the utf-8 encoded characters they
represent. However, it turns out that `.decode('unicode_escape')`
assumes latin-1 encoding if it encounters non-ascii
bytes: https://bugs.python.org/issue21331. So this gave incorrect
results if non-ascii utf8 data was present in the mapping.
To fix this, we now add an extra layer of
`.decode('utf8').encode('unicode-escape')` in order to convert any non-ascii characters into
their backslash escape sequences. Then the subsequent
`.decode('unicode_escape')` only encounters ascii characters and gives
correct results.
			
			
This commit is contained in:
		| @@ -426,12 +426,20 @@ def load_mapping(name, filename, mapping_is_raw): | ||||
|       return None | ||||
|     return (m.group(1).strip(), m.group(2).strip()) | ||||
|  | ||||
|   def process_unicode_escape_sequences(s): | ||||
|     # Replace unicode escape sequences in the otherwise UTF8-encoded bytestring s with | ||||
|     # the UTF8-encoded characters they represent. We need to do an additional | ||||
|     # .decode('utf8').encode('unicode-escape') to convert any non-ascii characters into | ||||
|     # their escape sequences so that the subsequent .decode('unicode-escape') succeeds: | ||||
|     return s.decode('utf8').encode('unicode-escape').decode('unicode-escape').encode('utf8') | ||||
|  | ||||
|   def parse_quoted_line(line): | ||||
|     m=quoted_regexp.match(line) | ||||
|     if m==None: | ||||
|       return None | ||||
|     return (m.group(1).decode('unicode_escape').encode('utf8'), | ||||
|             m.group(5).decode('unicode_escape').encode('utf8')) | ||||
|       return  | ||||
|      | ||||
|     return (process_unicode_escape_sequences(m.group(1)), | ||||
|             process_unicode_escape_sequences(m.group(5))) | ||||
|  | ||||
|   cache={} | ||||
|   if not os.path.exists(filename): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user