Resolve unicode escape sequences not being processed correctly

In `process_unicode_escape_sequences()`, any backslash escape sequences already present in the original string are themselves escaped by the first `.encode('unicode-escape')` step (each backslash is doubled), so they merely round-trip through `.encode('unicode-escape').decode('unicode-escape')` and come out unchanged. That is not what we want: these sequences should pass through the `.encode()` step untouched, so that they are converted to the characters they represent upon `.decode('unicode-escape')`. This patch changes the `.encode()` step to `.encode('ascii', 'backslashreplace')`, which passes all ASCII characters (including backslashes) through unchanged and escapes only non-ASCII characters. This ensures any existing backslash escape sequences are interpreted as the characters they represent upon `.decode('unicode-escape')`.
2025-10-31 08:35:46 +01:00 · 2022-10-23 11:51:33 +11:00
parent 667404e836
commit 13c273f10c
1 changed files with 9 additions and 3 deletions
--- a/hg-fast-export.py
+++ b/hg-fast-export.py
@@ -434,9 +434,15 @@ def load_mapping(name, filename, mapping_is_raw):
  def process_unicode_escape_sequences(s):
    # Replace unicode escape sequences in the otherwise UTF8-encoded bytestring s with
    # the UTF8-encoded characters they represent. We need to do an additional
-    # .decode('utf8').encode('unicode-escape') to convert any non-ascii characters into
-    # their escape sequences so that the subsequent .decode('unicode-escape') succeeds:
-    return s.decode('utf8').encode('unicode-escape').decode('unicode-escape').encode('utf8')
+    # .decode('utf8').encode('ascii', 'backslashreplace') to convert any non-ascii
+    # characters into their escape sequences so that the subsequent
+    # .decode('unicode-escape') succeeds:
+    return (
+      s.decode('utf8')
+      .encode('ascii', 'backslashreplace')
+      .decode('unicode-escape')
+      .encode('utf8')
+    )

  def parse_quoted_line(line):
    m=quoted_regexp.match(line)