mirror of
https://github.com/frej/fast-export.git
synced 2025-11-01 00:45:47 +01:00
Resolve unicode escape sequences not being processed correctly
In `process_unicode_escape_sequences()`, any backslash escape sequences
in the original string are themselves escaped (their backslashes are
doubled) by the first `.encode('unicode-escape')`, and therefore survive
the `.encode('unicode-escape').decode('unicode-escape')` round trip as
literal text instead of being converted.
That is not what we want: these sequences should pass through the
`.encode()` step unchanged, so that they are converted to the
characters they represent by the subsequent `.decode('unicode-escape')`.
This patch changes the `.encode()` step to pass through all ASCII
characters unchanged, escaping only non-ASCII characters. This ensures
that any existing backslash escape sequences are interpreted as the
characters they represent by the subsequent `.decode('unicode-escape')`.
This commit is contained in:
@@ -434,9 +434,15 @@ def load_mapping(name, filename, mapping_is_raw):
|
||||
def process_unicode_escape_sequences(s):
|
||||
# Replace unicode escape sequences in the otherwise UTF8-encoded bytestring s with
|
||||
# the UTF8-encoded characters they represent. We need to do an additional
|
||||
# .decode('utf8').encode('unicode-escape') to convert any non-ascii characters into
|
||||
# their escape sequences so that the subsequent .decode('unicode-escape') succeeds:
|
||||
return s.decode('utf8').encode('unicode-escape').decode('unicode-escape').encode('utf8')
|
||||
# .decode('utf8').encode('ascii', 'backslashreplace') to convert any non-ascii
|
||||
# characters into their escape sequences so that the subsequent
|
||||
# .decode('unicode-escape') succeeds:
|
||||
return (
|
||||
s.decode('utf8')
|
||||
.encode('ascii', 'backslashreplace')
|
||||
.decode('unicode-escape')
|
||||
.encode('utf8')
|
||||
)
|
||||
|
||||
def parse_quoted_line(line):
|
||||
m=quoted_regexp.match(line)
|
||||
|
||||
Reference in New Issue
Block a user