Support Python 3

Port hg-fast-export to Python 2/3 polyglot code.

Since mercurial accepts and returns bytestrings for all repository data,
the approach I've taken here is to use bytestrings throughout the
hg-fast-export code. All strings pertaining to repository data are
bytestrings. This means the code is using the same string datatype for
this data on Python 3 as it did (and still does) on Python 2.

Repository data coming from subprocess calls to git, or read from files,
is also left as the bytestrings either returned from
subprocess.check_output or as read from the file in 'rb' mode.

Regexes and string literals that are used with repository data have
all had a b'' prefix added.

When repository data is used in error/warning messages, it is decoded
with the UTF8 codec for printing.

With this patch, hg-fast-export.py writes binary output to
sys.stdout.buffer on Python 3 - on Python 2 this doesn't exist and it
still uses sys.stdout.

The only strings that are left as "native" strings and not coerced to
bytestrings are filepaths passed in on the command line, and dictionary
keys for internal data structures used by hg-fast-export.py, that do
not originate in repository data.

Mapping files are read in 'rb' mode, and thus bytestrings are read from
them. When an encoding is given, their contents are decoded with that
encoding, but then immediately encoded again with UTF8 and they are
returned as the resulting bytestrings.

Other necessary changes were:

 - indexing bytestrings with a single index returns an integer on Python 3.
   These indexing operations have been replaced with a one-element
   slice: x[0] -> x[0:1] or x[-1] -> x[-1:] so as to return a bytestring.

 - raw_hash.encode('hex_codec') replaced with binascii.hexlify(raw_hash)

 - str(integer) -> b'%d' % integer

 - 'string_escape' codec replaced with 'unicode_escape' (which was
    backported to python 2.7). Strings decoded with this codec were then
    immediately re-encoded with UTF8.

 - Calls to map() intended to execute their contents immediately were
   unwrapped or converted to list comprehensions, since on Python 3 map()
   returns a lazy iterator and does not execute until iterated over.

hg-fast-export.sh has been modified to not require Python 2. Instead, if
PYTHON has not been defined, it checks python2, python, then python3,
and uses the first one that exists and can import the mercurial module.
This commit is contained in:
chrisjbillington
2020-02-10 21:39:13 -05:00
parent 595587b245
commit b961f146df
10 changed files with 252 additions and 176 deletions

View File

@@ -12,14 +12,21 @@ import os
import sys
import subprocess
PY2 = sys.version_info.major < 3
if PY2:
str = unicode
fsencode = lambda s: s.encode(sys.getfilesystemencoding())
else:
from os import fsencode
# default git branch name
cfg_master='master'
cfg_master=b'master'
# default origin name
origin_name=''
origin_name=b''
# silly regex to see if user field has email address
user_re=re.compile('([^<]+) (<[^>]*>)$')
user_re=re.compile(b'([^<]+) (<[^>]*>)$')
# silly regex to clean out user names
user_clean_re=re.compile('^["]([^"]+)["]$')
user_clean_re=re.compile(b'^["]([^"]+)["]$')
def set_default_branch(name):
global cfg_master
@@ -34,26 +41,26 @@ def setup_repo(url):
myui=ui.ui(interactive=False)
except TypeError:
myui=ui.ui()
myui.setconfig('ui', 'interactive', 'off')
myui.setconfig(b'ui', b'interactive', b'off')
# Avoids a warning when the repository has obsolete markers
myui.setconfig('experimental', 'evolution.createmarkers', True)
return myui,hg.repository(myui,url).unfiltered()
myui.setconfig(b'experimental', b'evolution.createmarkers', True)
return myui,hg.repository(myui, fsencode(url)).unfiltered()
def fixup_user(user,authors):
user=user.strip("\"")
user=user.strip(b"\"")
if authors!=None:
# if we have an authors table, try to get mapping
# by defaulting to the current value of 'user'
user=authors.get(user,user)
name,mail,m='','',user_re.match(user)
name,mail,m=b'',b'',user_re.match(user)
if m==None:
# if we don't have 'Name <mail>' syntax, extract name
# and mail from hg helpers. this seems to work pretty well.
# if email doesn't contain @, replace it with devnull@localhost
name=templatefilters.person(user)
mail='<%s>' % templatefilters.email(user)
if '@' not in mail:
mail = '<devnull@localhost>'
mail=b'<%s>' % templatefilters.email(user)
if b'@' not in mail:
mail = b'<devnull@localhost>'
else:
# if we have 'Name <mail>' syntax, everything is fine :)
name,mail=m.group(1),m.group(2)
@@ -62,15 +69,15 @@ def fixup_user(user,authors):
m2=user_clean_re.match(name)
if m2!=None:
name=m2.group(1)
return '%s %s' % (name,mail)
return b'%s %s' % (name,mail)
def get_branch(name):
# 'HEAD' is the result of a bug in mutt's cvs->hg conversion,
# other CVS imports may need it, too
if name=='HEAD' or name=='default' or name=='':
if name==b'HEAD' or name==b'default' or name==b'':
name=cfg_master
if origin_name:
return origin_name + '/' + name
return origin_name + b'/' + name
return name
def get_changeset(ui,repo,revision,authors={},encoding=''):
@@ -79,16 +86,16 @@ def get_changeset(ui,repo,revision,authors={},encoding=''):
# how it fails
try:
node=repo.lookup(revision)
except hgerror.ProgrammingError:
node=binnode(revsymbol(repo,str(revision))) # We were given a numeric rev
except (TypeError, hgerror.ProgrammingError):
node=binnode(revsymbol(repo, b"%d" % revision)) # We were given a numeric rev
except hgerror.RepoLookupError:
node=revision # We got a raw hash
(manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node)
if encoding:
user=user.decode(encoding).encode('utf8')
desc=desc.decode(encoding).encode('utf8')
tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60))
branch=get_branch(extra.get('branch','master'))
tz=b"%+03d%02d" % (-timezone // 3600, ((-timezone % 3600) // 60))
branch=get_branch(extra.get(b'branch', b'master'))
return (node,manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra)
def mangle_key(key):
@@ -98,28 +105,33 @@ def load_cache(filename,get_key=mangle_key):
cache={}
if not os.path.exists(filename):
return cache
f=open(filename,'r')
f=open(filename,'rb')
l=0
for line in f.readlines():
l+=1
fields=line.split(' ')
if fields==None or not len(fields)==2 or fields[0][0]!=':':
fields=line.split(b' ')
if fields==None or not len(fields)==2 or fields[0][0:1]!=b':':
sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l))
continue
# put key:value in cache, key without ^:
cache[get_key(fields[0][1:])]=fields[1].split('\n')[0]
cache[get_key(fields[0][1:])]=fields[1].split(b'\n')[0]
f.close()
return cache
def save_cache(filename,cache):
f=open(filename,'w+')
map(lambda x: f.write(':%s %s\n' % (str(x),str(cache.get(x)))),cache.keys())
f=open(filename,'wb')
for key, value in cache.items():
if not isinstance(key, bytes):
key = str(key).encode('utf8')
if not isinstance(value, bytes):
value = str(value).encode('utf8')
f.write(b':%s %s\n' % (key, value))
f.close()
def get_git_sha1(name,type='heads'):
try:
# use git-rev-parse to support packed refs
ref="refs/%s/%s" % (type,name)
ref="refs/%s/%s" % (type,name.decode('utf8'))
l=subprocess.check_output(["git", "rev-parse", "--verify", "--quiet", ref])
if l == None or len(l) == 0:
return None