mirror of
https://github.com/frej/fast-export.git
synced 2025-11-01 08:55:47 +01:00
In the git repo there may be any number of branches that are not hg imported branches, so it doesn't make sense to print warnings when a non-hg head isn't at what it was last time. Now we get a list of branchtags hg has and only verify these. Signed-off-by: Rocco Rutte <pdmef@gmx.net>
347 lines
10 KiB
Python
347 lines
10 KiB
Python
#!/usr/bin/env python
|
|
|
|
# Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
|
|
# License: GPLv2
|
|
|
|
"""hg2git.py - A mercurial-to-git filter for git-fast-import(1)
|
|
Usage: hg2git.py <hg repo url> <max> <marks file> <heads file> <tip file>
|
|
"""
|
|
|
|
from mercurial import repo,hg,cmdutil,util,ui,revlog,node
|
|
from tempfile import mkstemp
|
|
import re
|
|
import sys
|
|
import os
|
|
|
|
# Matches a whole "Signed-off-by: ..." line of a log message; group 1 is
# everything after the colon and space (the signer).
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Matches a user field written as 'Name <mail>'; group 1 is the name,
# group 2 the address including its angle brackets.
user_re = re.compile('([^<]+) (<[^>]+>)$')

# Matches a user name that is wrapped in double quotes; group 1 is the
# name without the quotes.
user_clean_re = re.compile('^["]([^"]+)["]$')

# Name of the git branch that hg's 'HEAD' branch is exported to.
cfg_master = 'master'

# Emit a 'checkpoint' command every time this many commands (commits and
# tags) have been written; 0 disables checkpoints entirely.
cfg_checkpoint_count = 0
|
|
|
|
def usage(ret):
    """Write the module docstring (the usage text) to stderr.

    `ret` is handed back untouched so the call can be wrapped directly,
    as in `sys.exit(usage(1))`.
    """
    help_text = __doc__
    sys.stderr.write(help_text)
    return ret
|
|
|
|
def setup_repo(url):
    """Create a Mercurial ui object and open the repository at `url`.

    Returns the pair (ui, repository).
    """
    repo_ui = ui.ui()
    repository = hg.repository(repo_ui, url)
    return repo_ui, repository
|
|
|
|
def fixup_user(user, authors):
    """Turn an hg user string into the 'Name <mail>' form used for git.

    user    -- user field as found in the changeset (or a sign-off line)
    authors -- optional mapping of hg user strings to replacements; pass
               None to skip the lookup

    Returns a single string 'name <mail>'.
    """
    if authors is not None:
        # unknown users simply map to themselves
        user = authors.get(user, user)

    parsed = user_re.match(user)
    if parsed is not None:
        # already in 'Name <mail>' syntax: take both parts as they are
        name, mail = parsed.group(1), parsed.group(2)
    elif '@' in user:
        # looks like a bare address: use it as name and as mail
        name, mail = user, '<%s>' % user
    else:
        # no address at all: fall back to a placeholder mail
        name, mail = user, '<devnull@localhost>'

    # strip double quotes surrounding the whole name
    unquoted = user_clean_re.match(name)
    if unquoted is not None:
        name = unquoted.group(1)
    return '%s %s' % (name, mail)
|
|
|
|
def get_branch(name):
    """Map an hg branch name to the git branch name to export to.

    Only 'HEAD' is translated (to cfg_master); every other name is
    returned unchanged.
    """
    if name != 'HEAD':
        return name
    return cfg_master
|
|
|
|
def get_changeset(ui, repo, revision, authors):
    """Read one changeset and return its data prepared for export.

    ui       -- unused here, kept for the common call signature
    repo     -- repository object providing lookup() and changelog.read()
    revision -- revision to read
    authors  -- optional author mapping, passed on to fixup_user()

    Returns the tuple
    (manifest, user, (time, tz), files, desc, branch, extra)
    where `user` went through fixup_user(), `tz` is a '+HHMM'/'-HHMM'
    string and `branch` went through get_branch().
    """
    changeset_id = repo.lookup(revision)
    (manifest, user, (time, timezone), files, desc, extra) = \
        repo.changelog.read(changeset_id)

    # The sign is flipped before formatting, so the incoming offset is
    # presumably seconds west of UTC -- confirm against Mercurial.
    offset = -timezone
    # Split sign and magnitude first: floor division and modulo on a
    # negative number round towards minus infinity, which used to turn
    # e.g. -03:30 into '-0430'.
    sign = offset < 0 and '-' or '+'
    hours, remainder = divmod(abs(offset), 3600)
    tz = '%s%02d%02d' % (sign, hours, remainder // 60)

    branch = get_branch(extra.get('branch', 'master'))
    return (manifest, fixup_user(user, authors), (time, tz), files, desc,
            branch, extra)
|
|
|
|
def gitmode(x):
    """Return the git file mode string for a file.

    A true `x` (executable flag) yields '100755', anything else '100644'.
    """
    if x:
        return '100755'
    return '100644'
|
|
|
|
def wr(msg=''):
    """Write `msg` followed by a newline to stdout.

    All fast-import commands and data go through this function. Calling
    it without an argument emits an empty line.
    """
    # sys.stdout.write() instead of the print statement: same output,
    # but it does not depend on Python-2-only syntax.
    sys.stdout.write('%s\n' % (msg,))
|
|
|
|
def checkpoint(count):
    """Count one more issued command and emit a checkpoint when due.

    A 'checkpoint' command (plus an empty line) is written whenever
    cfg_checkpoint_count is positive and the new count is a multiple of
    it. Returns the incremented count.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
|
|
|
|
def get_parent_mark(parent, marks):
    """Return the fast-import reference for revision `parent`.

    parent -- hg revision number of the parent
    marks  -- marks table; entries loaded from the marks file map a mark
              number (revision+1) to a SHA1

    If the table holds a SHA1 for the parent's mark it is returned,
    otherwise the ':<revision+1>' mark syntax is used.

    The same table also receives entries of the form
    str(revision) -> ':<revision+1>' for commits written in the current
    session. Such an entry found under the key str(parent+1) belongs to
    revision parent+1, not to `parent`, so values starting with ':' are
    ignored here instead of being returned as the parent's reference.
    """
    mark = ':%d' % (parent + 1)
    cached = marks.get(str(parent + 1))
    if cached is None or cached.startswith(':'):
        return mark
    return cached
|
|
|
|
def mismatch(f1, f2):
    """Tell whether two file revision ids differ.

    Both ids are converted with node.hex() and the results compared;
    returns True when they are not equal.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
|
|
|
|
def outer_set(dleft, dright, l, c, r):
    """Compare two file tables and collect added, changed and removed names.

    dleft  -- mapping of file name to revision id for our side
    dright -- the same for the side we compare against (the parent)
    l      -- list receiving names only present in dleft
    c      -- list receiving names present in both whose ids mismatch
    r      -- list receiving names only present in dright

    The three lists are extended in place and returned as (l, c, r).
    """
    for name in dleft.keys():
        theirs = dright.get(name, None)
        if theirs is None:
            # we have the file, the other side does not
            l.append(name)
            continue
        if mismatch(dleft[name], theirs):
            # both sides have it, but in different revisions
            c.append(name)

    # Changed files were already found above, so the second pass only
    # has to look for names that are missing on our side.
    for name in dright.keys():
        if dleft.get(name, None) is None:
            r.append(name)
    return l, c, r
|
|
|
|
def get_filechanges(repo, revision, parents, mleft):
    """Find the files added, changed and removed relative to the parents.

    repo     -- repository object providing changectx()
    revision -- unused here, kept for the call signature
    parents  -- parent revision numbers; negative entries are skipped
    mleft    -- manifest (file name -> revision id) of the revision
                being exported

    Returns (added, changed, removed) as lists of file names. The
    results for all parents are accumulated into the same three lists.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent < 0:
            # no such parent
            continue
        mright = repo.changectx(parent).manifest()
        # outer_set() works on the manifests themselves; the sorted key
        # lists that used to be built here were never read.
        added, changed, removed = outer_set(mleft, mright,
                                            added, changed, removed)
    return added, changed, removed
|
|
|
|
def get_author(logmessage, committer, authors):
    """Derive the git author of a commit from its log message.

    Git distinguishes between the author and the committer of a patch,
    so the author is taken from the Signed-off-by lines at the very end
    of the log message.

    Trailing empty lines are skipped. Starting at the last non-empty
    line, the consecutive Signed-off-by lines are walked upwards and the
    topmost one of that trailing group is used; its user goes through
    fixup_user() (authors table, cleaning, etc.).

    If the last non-empty line is not a Signed-off-by line, `committer`
    is returned. Lines in the middle of a message that merely happen to
    start with "Signed-off-by: foo" are deliberately not considered.
    """
    lines = logmessage.split('\n')

    # find the last line that is not blank
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the trailing group of sign-off lines, remembering
    # the topmost one seen so far
    topmost = None
    while idx >= 0:
        hit = sob_re.match(lines[idx])
        if hit is None:
            break
        topmost = hit
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
|
|
|
|
def export_commit(ui, repo, revision, marks, heads, last, max, count, authors):
    """Write one hg revision as a fast-import 'commit' command.

    ui, repo -- Mercurial ui and repository objects
    revision -- hg revision number to export; its mark is revision+1
    marks    -- marks table; receives str(revision) -> ':<revision+1>'
    heads    -- cached branch heads (branch -> reference); the entry of
                the exported branch is reset to ''
    last     -- mapping of branch -> revision exported last on it
    max      -- not used in this function
    count    -- number of commands issued so far
    authors  -- optional author mapping

    Returns the new command count as returned by checkpoint().
    """
    (_, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)
    parents = repo.changelog.parentrevs(revision)
    mark = revision + 1

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % mark)
    wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # one extra byte for the newline that wr() appends to the message
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    # 'link' remembers what we passed to 'from' so that the same
    # reference is not emitted again as a 'merge' below
    link = ''
    cached_head = heads.get(branch, '')
    if cached_head != '':
        # incremental import: start the branch at its cached head once
        wr('from %s' % cached_head)
        heads[branch] = ''
        sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
                         (branch, cached_head))
        link = cached_head
    elif branch not in heads and revision > 0:
        # branch seen for the first time and not the very first
        # revision: connect it to its first parent
        fork_point = get_parent_mark(parents[0], marks)
        wr('from %s' % fork_point)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
                         (branch, fork_point))
        link = fork_point

    if parents:
        branch_tip = last.get(branch, revision)
        for p in parents:
            # skip nonexistent parents, ourselves and the most recent
            # commit of this branch (we are implicitly its child)
            if p < 0 or p == revision or p == branch_tip:
                continue
            parent_ref = get_parent_mark(p, marks)
            # already connected through 'from' above
            if parent_ref == link:
                continue
            sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
                             (branch, parent_ref, p))
            wr('merge %s' % parent_ref)

    last[branch] = revision
    heads[branch] = ''
    # needed later on to write out tags
    marks[str(revision)] = ':%d' % mark

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed = get_filechanges(repo, revision, parents, man)

    sys.stderr.write('Exporting revision %d with %d/%d/%d added/changed/removed files\n' %
                     (revision, len(added), len(changed), len(removed)))

    for filename in added + changed:
        content = ctx.filectx(filename).data()
        wr('M %s inline %s' % (gitmode(man.execf(filename)), filename))
        # length of the data we actually write
        wr('data %d' % len(content))
        wr(content)

    for filename in removed:
        wr('D %s' % filename)

    wr()
    return checkpoint(count)
|
|
|
|
def export_tags(ui, repo, marks_cache, start, end, count, authors):
    """Write fast-import 'tag' commands for the tags of the repository.

    ui, repo    -- Mercurial ui and repository objects
    marks_cache -- marks table, looked up by str(revision)
    start, end  -- only tags on revisions with start <= rev < end are
                   exported
    count       -- number of commands issued so far
    authors     -- optional author mapping

    The 'tip' tag is never exported. Tags whose revision has no entry in
    marks_cache are reported on stderr and skipped. Returns the updated
    command count.
    """
    for tag, tagnode in repo.tagslist():
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tagnode)
        if not (start <= rev < end):
            # outside of the range imported in this run
            continue

        ref = marks_cache.get(str(rev), None)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue

        (_, user, (time, timezone), _, desc, branch, _) = \
            get_changeset(ui, repo, rev, authors)
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        summary = desc.split('\n')[0]
        msg = 'hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (
            tag, rev, branch, summary)

        wr('tag %s' % tag)
        wr('from %s' % ref)
        wr('tagger %s %d %s' % (user, time, timezone))
        # one extra byte for the newline that wr() appends
        wr('data %d' % (len(msg) + 1))
        wr(msg)
        wr()
        count = checkpoint(count)
    return count
|
|
|
|
def load_cache(filename):
    """Load a ':key value' file into a dictionary.

    Every valid line consists of exactly two fields separated by a
    single space, the first one starting with ':'. The key is stored
    without that colon, the value without its trailing newline.
    Invalid lines are reported on stderr and skipped.

    Returns an empty dictionary if `filename` does not exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    f = open(filename, 'r')
    try:
        lineno = 0
        for line in f:
            lineno += 1
            fields = line.split(' ')
            # startswith() also copes with an empty first field (a line
            # beginning with a space), where indexing [0] would raise
            if len(fields) != 2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n' %
                                 (filename, lineno))
                continue
            # put key:value in cache, key without the leading ':'
            cache[fields[0][1:]] = fields[1].split('\n')[0]
    finally:
        # close the file even if reading fails half-way
        f.close()
    return cache
|
|
|
|
def save_cache(filename, cache):
    """Write a dictionary to `filename` as ':key value' lines.

    Keys and values are converted with str(). An existing file is
    overwritten.
    """
    f = open(filename, 'w+')
    try:
        # explicit loop: the writes must happen right here and must not
        # depend on map() evaluating eagerly
        for key in cache.keys():
            f.write(':%s %s\n' % (str(key), str(cache.get(key))))
    finally:
        f.close()
|
|
|
|
def verify_heads(ui, repo, cache):
    """Compare git's branch heads with the heads recorded in `cache`.

    ui    -- unused here, kept for the common call signature
    repo  -- repository object providing branchtags() and changelog.rev()
    cache -- mapping of git branch name to the SHA1 recorded for it

    Only the branches hg knows about are checked, not everything git
    has. A warning is written to stderr for every branch whose ref under
    $GIT_DIR/refs/heads differs from the cached value. Always returns
    True.
    """
    def getsha1(branch):
        """Return the first line of the branch's ref file, or None if
        the file cannot be opened."""
        path = os.getenv('GIT_DIR', '/dev/null') + '/refs/heads/' + branch
        try:
            f = open(path)
        except IOError:
            # no such ref (yet), e.g. on the very first import
            return None
        try:
            # readline() yields '' for an empty file instead of failing
            return f.readline().split('\n')[0]
        finally:
            f.close()

    # get list of hg's branches to verify, don't take all git has
    branches = repo.branchtags()
    ordered = [(-repo.changelog.rev(n), n, t) for t, n in branches.items()]
    ordered.sort()

    for _, _, name in ordered:
        name = get_branch(name)
        sys.stderr.write('Verifying branch [%s]\n' % name)
        sha1 = getsha1(name)
        cached = cache.get(name)
        # a branch unknown to both git and the cache compares equal
        # (None == None) and is not reported
        if sha1 != cached:
            sys.stderr.write('Warning: Branch [%s] modified outside hg2git:'
                             '\n%s (repo) != %s (cache)\n' % (name, sha1, cached))
    return True
|
|
|
|
def hg2git(repourl, m, marksfile, headsfile, tipfile, authors=None):
    """Export an hg repository as a git-fast-import stream on stdout.

    repourl   -- URL/path of the hg repository
    m         -- maximum revision (exclusive) to export, as int or
                 string; a negative value or one beyond the number of
                 revisions in the repository means "up to tip"
    marksfile -- marks file to load
    headsfile -- file with the cached branch heads
    tipfile   -- state file; its 'tip' entry is the first revision to
                 export and is rewritten at the end
    authors   -- optional mapping of hg users to replacements

    Returns 0 on success, 1 if verify_heads() fails.
    """
    # never share one default dictionary between calls
    if authors is None:
        authors = {}
    _max = int(m)

    marks_cache = load_cache(marksfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache):
        return 1

    tip = repo.changelog.count()

    start = int(state_cache.get('tip', 0))
    stop = _max
    # clamp to what the repository really has so that we never try to
    # look up a revision that does not exist
    if _max < 0 or _max > tip:
        stop = tip

    c = 0
    last = {}
    for rev in range(start, stop):
        c = export_commit(ui, repo, rev, marks_cache, heads_cache, last,
                          tip, c, authors)

    c = export_tags(ui, repo, marks_cache, start, stop, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    # remember where to continue on the next (incremental) run
    state_cache['tip'] = stop
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)

    return 0
|
|
|
|
# Script entry point: expects exactly five arguments (repository URL,
# max revision, marks file, heads file, tip file); anything else prints
# the usage text and exits with status 1.
if __name__ == '__main__':
    if len(sys.argv) != 6:
        sys.exit(usage(1))
    repourl, m, marksfile, headsfile, tipfile = sys.argv[1:]
    status = hg2git(repourl, m, marksfile, headsfile, tipfile)
    sys.exit(status)
|