Add option for specifying the text encoding used by Mercurial

When a Mercurial repository does not use UTF-8 for encoding author
strings and commit messages, the "-e <encoding>" command line option
can be used to force fast-export to convert incoming metadata from
<encoding> to UTF-8.

When "-e <encoding>" is given, we use Python's string
decoding/encoding API to convert metadata on the fly when processing
commits.
This commit is contained in:
zed
2014-10-25 13:18:41 +03:00
committed by Frej Drejhammar
parent f64c10ba14
commit e87c9cb3b8
4 changed files with 26 additions and 8 deletions

5
README
View File

@@ -34,6 +34,11 @@ hg-fast-export'ed from mercurial:
will give hints on which branches need adjustment for starting over will give hints on which branches need adjustment for starting over
again. again.
When a Mercurial repository does not use UTF-8 for encoding author
strings and commit messages, the "-e <encoding>" command line option
can be used to force fast-export to convert incoming metadata from
<encoding> to UTF-8.
As mercurial appears to be much less picky about the syntax of the As mercurial appears to be much less picky about the syntax of the
author information than git, an author mapping file can be given to author information than git, an author mapping file can be given to
hg-fast-export to fix up malformed author strings. The file is hg-fast-export to fix up malformed author strings. The file is

View File

@@ -159,7 +159,7 @@ def sanitize_name(name,what="branch"):
sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n)) sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
return n return n
def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap,hgtags,notes): def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap,hgtags,notes,encoding=''):
def get_branchname(name): def get_branchname(name):
if brmap.has_key(name): if brmap.has_key(name):
return brmap[name] return brmap[name]
@@ -167,7 +167,7 @@ def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap,hgtags,
brmap[name]=n brmap[name]=n
return n return n
(revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors) (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding)
branch=get_branchname(branch) branch=get_branchname(branch)
@@ -323,7 +323,7 @@ def verify_heads(ui,repo,cache,force):
return True return True
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors={},sob=False,force=False,hgtags=False,notes=False): def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors={},sob=False,force=False,hgtags=False,notes=False,encoding=''):
_max=int(m) _max=int(m)
old_marks=load_cache(marksfile,lambda s: int(s)-1) old_marks=load_cache(marksfile,lambda s: int(s)-1)
@@ -354,7 +354,7 @@ def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors={},sob=Fals
c=0 c=0
brmap={} brmap={}
for rev in range(min,max): for rev in range(min,max):
c=export_commit(ui,repo,rev,old_marks,max,c,authors,sob,brmap,hgtags,notes) c=export_commit(ui,repo,rev,old_marks,max,c,authors,sob,brmap,hgtags,notes,encoding)
state_cache['tip']=max state_cache['tip']=max
state_cache['repo']=repourl state_cache['repo']=repourl
@@ -401,6 +401,8 @@ if __name__=='__main__':
help="use <name> as namespace to track upstream") help="use <name> as namespace to track upstream")
parser.add_option("--hg-hash",action="store_true",dest="notes", parser.add_option("--hg-hash",action="store_true",dest="notes",
default=False,help="Annotate commits with the hg hash as git notes in the hg namespace") default=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
parser.add_option("-e",dest="encoding",
help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
(options,args)=parser.parse_args() (options,args)=parser.parse_args()
@@ -423,5 +425,11 @@ if __name__=='__main__':
if options.origin_name!=None: if options.origin_name!=None:
set_origin_name(options.origin_name) set_origin_name(options.origin_name)
sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile,options.headsfile, encoding=''
options.statusfile,authors=a,sob=options.sob,force=options.force,hgtags=options.hgtags,notes=options.notes)) if options.encoding!=None:
encoding=options.encoding
sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile,
options.headsfile, options.statusfile,authors=a,
sob=options.sob,force=options.force,hgtags=options.hgtags,
notes=options.notes,encoding=encoding))

View File

@@ -13,7 +13,7 @@ SFX_STATE="state"
GFI_OPTS="" GFI_OPTS=""
PYTHON=${PYTHON:-python} PYTHON=${PYTHON:-python}
USAGE="[--quiet] [-r <repo>] [--force] [-m <max>] [-s] [--hgtags] [-A <file>] [-M <name>] [-o <name>] [--hg-hash]" USAGE="[--quiet] [-r <repo>] [--force] [-m <max>] [-s] [--hgtags] [-A <file>] [-M <name>] [-o <name>] [--hg-hash] [-e <encoding>]"
LONG_USAGE="Import hg repository <repo> up to either tip or <max> LONG_USAGE="Import hg repository <repo> up to either tip or <max>
If <repo> is omitted, use last hg repository as obtained from state file, If <repo> is omitted, use last hg repository as obtained from state file,
GIT_DIR/$PFX-$SFX_STATE by default. GIT_DIR/$PFX-$SFX_STATE by default.
@@ -34,6 +34,8 @@ Options:
-o <name> Use <name> as branch namespace to track upstream (eg 'origin') -o <name> Use <name> as branch namespace to track upstream (eg 'origin')
--hg-hash Annotate commits with the hg hash as git notes in the --hg-hash Annotate commits with the hg hash as git notes in the
hg namespace. hg namespace.
-e <encoding> Assume commit and author strings retrieved from
Mercurial are encoded in <encoding>
" "
case "$1" in case "$1" in
-h|--help) -h|--help)

View File

@@ -67,9 +67,12 @@ def get_branch(name):
return origin_name + '/' + name return origin_name + '/' + name
return name return name
def get_changeset(ui,repo,revision,authors={}): def get_changeset(ui,repo,revision,authors={},encoding=''):
node=repo.lookup(revision) node=repo.lookup(revision)
(manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node) (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node)
if encoding:
user=user.decode(encoding).encode('utf8')
desc=desc.decode(encoding).encode('utf8')
tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60)) tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60))
branch=get_branch(extra.get('branch','master')) branch=get_branch(extra.get('branch','master'))
return (node,manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra) return (node,manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra)