Files
Fast-Export/hg2git.py
Rocco Rutte 85f0d9c881 hg2git.py: Refactor main code into hg2git() function
Now this can even be used as a module from other python scripts by
simply calling the hg2git() function.

Except some config values nobody really ever wants to change, it's even
save to run several hg2git() functions in parallel as no global vars or
the like are used by intention (but it makes the code uglier).

Signed-off-by: Rocco Rutte <pdmef@gmx.net>
2007-03-08 11:21:21 +00:00

279 lines
8.1 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
# License: GPLv2
"""hg2git.py - A mercurial-to-git filter for git-fast-import(1)
Usage: hg2git.py <hg repo url> <marks file> <heads file> <tip file>
"""
from mercurial import repo,hg,cmdutil,util,ui,revlog,node
from tempfile import mkstemp
import re
import sys
import os
# silly regex to see if user field has email address
user_re=re.compile('[^<]+ <[^>]+>$')
# git branch for hg's default 'HEAD' branch
cfg_master='master'
# insert 'checkpoint' command after this many commits or none at all if 0
cfg_checkpoint_count=0
def usage(ret):
sys.stderr.write(__doc__)
return ret
def setup_repo(url):
myui=ui.ui()
return myui,hg.repository(myui,url)
def get_changeset(ui,repo,revision):
def get_branch(name):
if name=='HEAD':
name=cfg_master
return name
def fixup_user(user):
if user_re.match(user)==None:
if '@' not in user:
return user+' <none@none>'
return user+' <'+user+'>'
return user
node=repo.lookup(revision)
(manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node)
tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60))
branch=get_branch(extra.get('branch','master'))
return (manifest,fixup_user(user),(time,tz),files,desc,branch,extra)
def gitmode(x):
return x and '100755' or '100644'
def wr(msg=''):
print msg
#map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
def checkpoint(count):
count=count+1
if cfg_checkpoint_count>0 and count%cfg_checkpoint_count==0:
sys.stderr.write("Checkpoint after %d commits\n" % count)
wr('checkpoint')
wr()
return count
def get_parent_mark(parent,marks):
"""Get the mark for some parent.
If we saw it in the current session, return :%d syntax and
otherwise the SHA1 from the cache."""
return marks.get(str(parent+1),':%d' % (parent+1))
def mismatch(f1,f2):
"""See if two revisions of a file are not equal."""
return node.hex(f1)!=node.hex(f2)
def outer_set(dleft,dright,l,c,r):
"""Loop over our repository and find all changed and missing files."""
for left in dleft.keys():
right=dright.get(left,None)
if right==None:
# we have the file but our parent hasn't: add to left set
l.append(left)
elif mismatch(dleft[left],right):
# we have it but checksums mismatch: add to center set
c.append(left)
for right in dright.keys():
left=dleft.get(right,None)
if left==None:
# if parent has file but we don't: add to right set
r.append(right)
# change is already handled when comparing child against parent
return l,c,r
def get_filechanges(repo,revision,parents,mleft):
"""Given some repository and revision, find all changed/deleted files."""
l,c,r=[],[],[]
for p in parents:
if p<0: continue
mright=repo.changectx(p).manifest()
dleft=mleft.keys()
dleft.sort()
dright=mright.keys()
dright.sort()
l,c,r=outer_set(mleft,mright,l,c,r)
return l,c,r
def export_commit(ui,repo,revision,marks,heads,last,max,count):
(_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision)
parents=repo.changelog.parentrevs(revision)
wr('commit refs/heads/%s' % branch)
wr('mark :%d' % (revision+1))
wr('committer %s %d %s' % (user,time,timezone))
wr('data %d' % (len(desc)+1)) # wtf?
wr(desc)
wr()
src=heads.get(branch,'')
link=''
if src!='':
# if we have a cached head, this is an incremental import: initialize it
# and kill reference so we won't init it again
wr('from %s' % src)
heads[branch]=''
sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
(branch,src))
link=src # avoid making a merge commit for incremental import
elif link=='' and not heads.has_key(branch) and revision>0:
# newly created branch and not the first one: connect to parent
tmp=get_parent_mark(parents[0],marks)
wr('from %s' % tmp)
sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
(branch,tmp))
link=tmp # avoid making a merge commit for branch fork
if parents:
l=last.get(branch,revision)
for p in parents:
# 1) as this commit implicitely is the child of the most recent
# commit of this branch, ignore this parent
# 2) ignore nonexistent parents
# 3) merge otherwise
if p==l or p==revision or p<0:
continue
tmp=get_parent_mark(p,marks)
# if we fork off a branch, don't merge with our parent via 'merge'
# as we have 'from' already above
if tmp==link:
continue
sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
(branch,tmp,p))
wr('merge %s' % tmp)
last[branch]=revision
heads[branch]=''
# we need this later to write out tags
marks[str(revision)]=':%d'%(revision+1)
ctx=repo.changectx(str(revision))
man=ctx.manifest()
added,changed,removed=get_filechanges(repo,revision,parents,man)
sys.stderr.write('Exporting revision %d with %d/%d/%d added/changed/removed files\n' %
(revision,len(added),len(changed),len(removed)))
for a in added+changed:
fctx=ctx.filectx(a)
d=fctx.data()
wr('M %s inline %s' % (gitmode(man.execf(a)),a))
wr('data %d' % len(d)) # had some trouble with size()
wr(d)
for r in removed:
wr('D %s' % r)
wr()
return checkpoint(count)
def export_tags(ui,repo,marks_cache,start,end,count):
l=repo.tagslist()
for tag,node in l:
# ignore latest revision
if tag=='tip': continue
rev=repo.changelog.rev(node)
# ignore those tags not in our import range
if rev<start or rev>=end: continue
ref=marks_cache.get(str(rev),None)
if ref==None:
sys.stderr.write('Failed to find reference for creating tag'
' %s at r%d\n' % (tag,rev))
continue
(_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev)
sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
wr('tag %s' % tag)
wr('from %s' % ref)
wr('tagger %s %d %s' % (user,time,timezone))
msg='hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (tag,
rev,branch,desc.split('\n')[0])
wr('data %d' % (len(msg)+1))
wr(msg)
wr()
count=checkpoint(count)
return count
def load_cache(filename):
cache={}
if not os.path.exists(filename):
return cache
f=open(filename,'r')
l=0
for line in f.readlines():
l+=1
fields=line.split(' ')
if fields==None or not len(fields)==2 or fields[0][0]!=':':
sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l))
continue
# put key:value in cache, key without ^:
cache[fields[0][1:]]=fields[1].split('\n')[0]
f.close()
return cache
def save_cache(filename,cache):
f=open(filename,'w+')
map(lambda x: f.write(':%s %s\n' % (str(x),str(cache.get(x)))),cache.keys())
f.close()
def verify_heads(ui,repo,cache):
def getsha1(branch):
f=open(os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch)
sha1=f.readlines()[0].split('\n')[0]
f.close()
return sha1
for b in cache.keys():
sys.stderr.write('Verifying branch [%s]\n' % b)
sha1=getsha1(b)
c=cache.get(b)
if sha1!=c:
sys.stderr.write('Warning: Branch [%s] modified outside hg2git:'
'\n%s (repo) != %s (cache)\n' % (b,sha1,c))
return True
def hg2git(repourl,m,marksfile,headsfile,tipfile):
_max=int(m)
marks_cache=load_cache(marksfile)
heads_cache=load_cache(headsfile)
state_cache=load_cache(tipfile)
ui,repo=setup_repo(repourl)
if not verify_heads(ui,repo,heads_cache):
return 1
tip=repo.changelog.count()
min=int(state_cache.get('tip',0))
max=_max
if _max<0:
max=tip
c=0
last={}
for rev in range(min,max):
c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,tip,c)
c=export_tags(ui,repo,marks_cache,min,max,c)
sys.stderr.write('Issued %d commands\n' % c)
state_cache['tip']=max
state_cache['repo']=repourl
save_cache(tipfile,state_cache)
return 0
if __name__=='__main__':
if len(sys.argv)!=6: sys.exit(usage(1))
repourl,m,marksfile,headsfile,tipfile=sys.argv[1:]
sys.exit(hg2git(repourl,m,marksfile,headsfile,tipfile))