mirror of
https://github.com/frej/fast-export.git
synced 2025-11-01 00:45:47 +01:00
Allow utf8 in mappings
We were previously processing entries in mapping files (when
`--mappings-are-raw` is not given) with
`.decode('unicode_escape').encode('utf8')` to replace backslash escape
sequences in bytestrings with the utf-8 encoded characters they
represent. However, it turns out that `.decode('unicode_escape')`
assumes latin-1 encoding if it encounters non-ascii
bytes: https://bugs.python.org/issue21331. So this gave incorrect
results if non-ascii utf8 data was present in the mapping.
To fix this, we now add an extra layer of
`.decode('utf8').encode('unicode-escape')` in order to convert any non-ascii characters into
their backslash escape sequences. Then the subsequent
`.decode('unicode_escape')` only encounters ascii characters and gives
correct results.
This commit is contained in:
@@ -80,10 +80,10 @@ author information than git, an author mapping file can be given to
|
|||||||
hg-fast-export to fix up malformed author strings. The file is
|
hg-fast-export to fix up malformed author strings. The file is
|
||||||
specified using the -A option. The file should contain lines of the
|
specified using the -A option. The file should contain lines of the
|
||||||
form `"<key>"="<value>"`. Inside the key and value strings, all escape
|
form `"<key>"="<value>"`. Inside the key and value strings, all escape
|
||||||
sequences understood by the python `string_escape` encoding are
|
sequences understood by the python `unicode_escape` encoding are
|
||||||
supported. (Versions of fast-export prior to v171002 had a different
|
supported; strings are otherwise assumed to be UTF8-encoded.
|
||||||
syntax, the old syntax can be enabled by the flag
|
(Versions of fast-export prior to v171002 had a different syntax, the
|
||||||
`--mappings-are-raw`.)
|
old syntax can be enabled by the flag `--mappings-are-raw`.)
|
||||||
|
|
||||||
The example authors.map below will translate `User
|
The example authors.map below will translate `User
|
||||||
<garbage<tab><user@example.com>` to `User <user@example.com>`.
|
<garbage<tab><user@example.com>` to `User <user@example.com>`.
|
||||||
|
|||||||
@@ -426,12 +426,20 @@ def load_mapping(name, filename, mapping_is_raw):
|
|||||||
return None
|
return None
|
||||||
return (m.group(1).strip(), m.group(2).strip())
|
return (m.group(1).strip(), m.group(2).strip())
|
||||||
|
|
||||||
|
def process_unicode_escape_sequences(s):
|
||||||
|
# Replace unicode escape sequences in the otherwise UTF8-encoded bytestring s with
|
||||||
|
# the UTF8-encoded characters they represent. We need to do an additional
|
||||||
|
# .decode('utf8').encode('unicode-escape') to convert any non-ascii characters into
|
||||||
|
# their escape sequences so that the subsequent .decode('unicode-escape') succeeds:
|
||||||
|
return s.decode('utf8').encode('unicode-escape').decode('unicode-escape').encode('utf8')
|
||||||
|
|
||||||
def parse_quoted_line(line):
|
def parse_quoted_line(line):
|
||||||
m=quoted_regexp.match(line)
|
m=quoted_regexp.match(line)
|
||||||
if m==None:
|
if m==None:
|
||||||
return None
|
return
|
||||||
return (m.group(1).decode('unicode_escape').encode('utf8'),
|
|
||||||
m.group(5).decode('unicode_escape').encode('utf8'))
|
return (process_unicode_escape_sequences(m.group(1)),
|
||||||
|
process_unicode_escape_sequences(m.group(5)))
|
||||||
|
|
||||||
cache={}
|
cache={}
|
||||||
if not os.path.exists(filename):
|
if not os.path.exists(filename):
|
||||||
|
|||||||
Reference in New Issue
Block a user