#!/usr/bin/env python

# Copyright (c) 2007 Rocco Rutte
# License: GPLv2

"""hg2git.py - A mercurial-to-git filter for git-fast-import(1)
Usage: hg2git.py <hg repo url> <max rev> <marks file> <heads file> <tip file>

Revisions from the 'tip' recorded in <tip file> (0 if absent) up to, but not
including, <max rev> are exported; a negative <max rev> means "up to the
repository tip".
"""

from mercurial import repo,hg,cmdutil,util,ui,revlog,node
from tempfile import mkstemp
import re
import sys
import os

# silly regex to see if user field has email address
user_re=re.compile('[^<]+ <[^>]+>$')
# git branch for hg's default 'HEAD' branch
cfg_master='master'
# insert 'checkpoint' command after this many commits
cfg_checkpoint_count=1000


def usage(ret):
    """Write the module docstring (usage text) to stderr and return `ret`."""
    sys.stderr.write(__doc__)
    return ret


def setup_repo(url):
    """Open the Mercurial repository at `url`.

    Returns a (ui, repository) tuple; the ui object is freshly created.
    """
    myui=ui.ui()
    return myui,hg.repository(myui,url)


def get_changeset(ui,repo,revision):
    """Read one changeset and normalize it for git-fast-import.

    Returns (manifest, user, (time, tz), files, desc, branch, extra) where
    `user` always has the form 'name <email>', `tz` is a '+HHMM'/'-HHMM'
    string and `branch` is the git branch name to commit to.
    """
    def get_branch(name):
        # hg's 'HEAD' maps onto the configured git master branch
        if name=='HEAD':
            name=cfg_master
        return name

    def fixup_user(user):
        # git-fast-import needs 'name <email>'; synthesize the missing part
        if user_re.match(user) is None:
            if '@' not in user:
                # no address at all: use a placeholder so the line stays valid
                return user+' <none@none>'
            # bare address: use it both as name and as email
            return user+' <'+user+'>'
        return user

    def format_tz(timezone):
        # The offset read from the changelog is negated before formatting
        # (NOTE(review): i.e. hg presumably stores seconds *west* of UTC).
        # Sign and magnitude are handled separately: floor division on a
        # negative offset would turn e.g. -3:30 into '-0430'.
        offset=-timezone
        if offset<0:
            sign='-'
        else:
            sign='+'
        offset=abs(offset)
        return '%s%02d%02d' % (sign,offset//3600,(offset%3600)//60)

    changenode=repo.lookup(revision)
    (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(changenode)
    tz=format_tz(timezone)
    branch=get_branch(extra.get('branch','master'))
    return (manifest,fixup_user(user),(time,tz),files,desc,branch,extra)


def gitmode(x):
    """Return the git file mode: '100755' if `x` is true, else '100644'."""
    if x:
        return '100755'
    return '100644'


def wr(msg=''):
    """Write `msg` followed by a newline to stdout (the fast-import stream)."""
    sys.stdout.write(msg+'\n')


def checkpoint(count):
    """Count one more issued command and return the new count.

    Every `cfg_checkpoint_count` commands a 'checkpoint' command is written
    to the stream and a note is logged to stderr.
    """
    count=count+1
    if count%cfg_checkpoint_count==0:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count


def get_parent_mark(parent,marks):
    """Get the mark for some parent.

    `marks` is keyed by mark id, i.e. str(revision+1). If we saw the parent
    in the current session, return :%d syntax and otherwise the SHA1 from
    the cache.
    """
    return marks.get(str(parent+1),':%d' % (parent+1))


def mismatch(f1,f2):
    """See if two revisions of a file are not equal."""
    return node.hex(f1)!=node.hex(f2)


def outer_set(dleft,dright,l,c,r):
    """Loop over our repository and find all changed and missing files.

    `dleft` is our manifest, `dright` the parent's. File names are appended
    to `l` (only in ours), `c` (in both, different content) and `r` (only
    in the parent); the three lists are returned.
    """
    for left in dleft.keys():
        right=dright.get(left,None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif mismatch(dleft[left],right):
            # we have it but checksums mismatch: add to center set
            c.append(left)
    for right in dright.keys():
        if dleft.get(right,None) is None:
            # if parent has file but we don't: add to right set
            r.append(right)
        # change is already handled when comparing child against parent
    return l,c,r


def get_filechanges(repo,revision,parents,mleft):
    """Given some repository and revision, find all changed/deleted files.

    `mleft` is the manifest of `revision`; it is compared against the
    manifest of every existing parent. Returns (added, changed, removed)
    lists of file names. A revision without any real parent has nothing to
    be compared against, so its whole manifest is reported as added.
    """
    l,c,r=[],[],[]
    has_parent=False
    for p in parents:
        if p<0:
            # nonexistent (null) parent
            continue
        has_parent=True
        mright=repo.changectx(p).manifest()
        l,c,r=outer_set(mleft,mright,l,c,r)
    if not has_parent:
        # root changeset: every file it contains is new
        l=sorted(mleft.keys())
    return l,c,r


def export_commit(ui,repo,revision,marks,heads,last,max,count):
    """Write one hg revision as a 'commit' command to the stream.

    `marks` maps mark ids (str(revision+1)) to a reference, `heads` maps
    branch names to a cached head (or '' once initialized in this session)
    and `last` maps branch names to the revision most recently exported on
    them; all three are updated. `max` is accepted but not used. Returns
    the command count as updated by checkpoint().
    """
    (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,revision)
    parents=repo.changelog.parentrevs(revision)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    wr('committer %s %d %s' % (user,time,timezone))
    # +1: wr() appends a newline which is counted as part of the payload
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    src=heads.get(branch,'')
    link=''
    if src!='':
        # if we have a cached head, this is an incremental import: initialize it
        # and kill reference so we won't init it again
        wr('from %s' % src)
        heads[branch]=''
        sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
                         (branch,src))
        link=src # avoid making a merge commit for incremental import
    elif branch not in heads and revision>0 and parents[0]>=0:
        # newly created branch and not the first one: connect to parent
        # (a null first parent has no mark to connect to)
        tmp=get_parent_mark(parents[0],marks)
        wr('from %s' % tmp)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
                         (branch,tmp))
        link=tmp # avoid making a merge commit for branch fork

    if parents:
        l=last.get(branch,revision)
        for p in parents:
            # 1) as this commit implicitly is the child of the most recent
            #    commit of this branch, ignore this parent
            # 2) ignore nonexistent parents
            # 3) merge otherwise
            if p==l or p==revision or p<0:
                continue
            tmp=get_parent_mark(p,marks)
            # if we fork off a branch, don't merge with our parent via 'merge'
            # as we have 'from' already above
            if tmp==link:
                continue
            sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
                             (branch,tmp,p))
            wr('merge %s' % tmp)

    last[branch]=revision
    heads[branch]=''
    # we need this later to write out tags; keyed by mark id so that it
    # agrees with the lookups done by get_parent_mark()
    marks[str(revision+1)]=':%d'%(revision+1)

    ctx=repo.changectx(str(revision))
    man=ctx.manifest()
    added,changed,removed=get_filechanges(repo,revision,parents,man)

    sys.stderr.write('Exporting revision %d with %d/%d/%d added/changed/removed files\n' %
                     (revision,len(added),len(changed),len(removed)))

    for a in added+changed:
        fctx=ctx.filectx(a)
        d=fctx.data()
        wr('M %s inline %s' % (gitmode(man.execf(a)),a))
        wr('data %d' % len(d)) # had some trouble with size()
        wr(d)

    for r in removed:
        wr('D %s' % r)

    wr()
    return checkpoint(count)


def export_tags(ui,repo,marks_cache,start,end,count):
    """Write a 'tag' command for every hg tag on a revision in [start,end).

    The 'tip' tag is skipped, as are tags for which `marks_cache` holds no
    reference (a warning is logged). Returns the updated command count.
    """
    for tag,tagnode in repo.tagslist():
        # ignore latest revision
        if tag=='tip':
            continue
        rev=repo.changelog.rev(tagnode)
        # ignore those tags not in our import range
        if rev<start or rev>=end:
            continue

        # marks are keyed by mark id, which is revision+1
        ref=marks_cache.get(str(rev+1),None)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag,rev))
            continue
        (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev)
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
        wr('tag %s' % tag)
        wr('from %s' % ref)
        wr('tagger %s %d %s' % (user,time,timezone))
        msg='hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (tag,
            rev,branch,desc.split('\n')[0])
        # +1: wr() appends a newline which is counted as part of the payload
        wr('data %d' % (len(msg)+1))
        wr(msg)
        wr()
        count=checkpoint(count)
    return count


def load_cache(filename):
    """Load a ':key value' per line cache file into a dict.

    A missing file yields an empty dict. Lines that do not consist of
    exactly two space-separated fields with the first one starting with ':'
    are reported on stderr and skipped. Keys are stored without the ':'.
    """
    cache={}
    if not os.path.exists(filename):
        return cache
    f=open(filename,'r')
    try:
        lineno=0
        for line in f:
            lineno+=1
            fields=line.split(' ')
            if len(fields)!=2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n' %
                                 (filename,lineno))
                continue
            # put key:value in cache, key without ^:
            cache[fields[0][1:]]=fields[1].split('\n')[0]
    finally:
        f.close()
    return cache


def save_cache(filename,cache):
    """Write `cache` to `filename`, one ':key value' line per entry."""
    f=open(filename,'w+')
    try:
        for key in cache.keys():
            f.write(':%s %s\n' % (str(key),str(cache.get(key))))
    finally:
        f.close()


def verify_heads(ui,repo,cache):
    """Compare cached branch heads against the refs below $GIT_DIR.

    For every branch in `cache` the first line of
    $GIT_DIR/refs/heads/<branch> is read ('/dev/null' is used as the
    directory if GIT_DIR is unset) and a warning is logged on mismatch.
    Mismatches are not fatal: the function always returns True.
    """
    def getsha1(branch):
        f=open(os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch)
        try:
            return f.readline().split('\n')[0]
        finally:
            f.close()

    for b in cache.keys():
        sys.stderr.write('Verifying branch [%s]\n' % b)
        sha1=getsha1(b)
        c=cache.get(b)
        if sha1!=c:
            sys.stderr.write('Warning: Branch [%s] modified outside hg2git:'
                             '\n%s (repo) != %s (cache)\n' % (b,sha1,c))
    return True


def main(argv):
    """Run the export for the command line `argv`; return the exit status."""
    if len(argv)!=6:
        return usage(1)
    repourl,m,marksfile,headsfile,tipfile=argv[1:]
    _max=int(m)

    marks_cache=load_cache(marksfile)
    heads_cache=load_cache(headsfile)
    state_cache=load_cache(tipfile)

    myui,hgrepo=setup_repo(repourl)

    if not verify_heads(myui,hgrepo,heads_cache):
        return 1

    tip=hgrepo.changelog.count()

    # resume after the last revision exported by a previous run
    min_rev=int(state_cache.get('tip',0))
    max_rev=_max
    if _max<0:
        max_rev=tip

    c=0
    last={}
    for rev in range(min_rev,max_rev):
        c=export_commit(myui,hgrepo,rev,marks_cache,heads_cache,last,tip,c)

    c=export_tags(myui,hgrepo,marks_cache,min_rev,max_rev,c)

    sys.stderr.write('Issued %d commands\n' % c)

    state_cache['tip']=max_rev
    state_cache['repo']=repourl
    save_cache(tipfile,state_cache)
    return 0


if __name__=='__main__':
    sys.exit(main(sys.argv))