Mercurial > hg > soundsoftware-site
diff extra/fast-export/hg-fast-export.py @ 1544:e9e55585ebf2 feature_1136
Add fast-export
author | Chris Cannam <chris.cannam@soundsoftware.ac.uk> |
---|---|
date | Tue, 12 Jan 2016 13:39:30 +0000 |
parents | |
children | 3ad53f43483d |
line wrap: on
line diff
#!/usr/bin/env python

# Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
# License: MIT <http://www.opensource.org/licenses/mit-license.php>

from mercurial import node
from hg2git import setup_repo,fixup_user,get_branch,get_changeset
from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
from optparse import OptionParser
import re
import sys
import os

if sys.platform == "win32":
    # On Windows, sys.stdout is initially opened in text mode, which means that
    # when a LF (\n) character is written to sys.stdout, it will be converted
    # into CRLF (\r\n). That makes git blow up, so use this platform-specific
    # code to change the mode of sys.stdout to binary.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

# silly regex to catch Signed-off-by lines in log message
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# insert 'checkpoint' command after this many commits or none at all if 0
cfg_checkpoint_count = 0
# write some progress message every this many file contents written
cfg_export_boundary = 1000


def gitmode(flags):
    """Return the git file mode string for a set of hg file flags.

    'l' (checked first) yields '120000', 'x' yields '100755', anything else
    yields '100644'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'


def wr_no_nl(msg=''):
    """Write msg to stdout without a trailing newline; no-op if msg is empty."""
    if msg:
        sys.stdout.write(msg)


def wr(msg=''):
    """Write msg to stdout followed by a single newline."""
    wr_no_nl(msg)
    sys.stdout.write('\n')


def checkpoint(count):
    """Increment the command counter and return it.

    If cfg_checkpoint_count is positive, a 'checkpoint' command is emitted
    every cfg_checkpoint_count increments.
    """
    count = count + 1
    if cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count


def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference (an SHA1
    or a mark)"""
    return old_marks.get(rev) or ':%d' % (rev + 1)


def file_mismatch(f1, f2):
    """See if two revisions of a file are not equal."""
    return node.hex(f1) != node.hex(f2)


def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Compare two manifests and collect added, changed and removed files.

    Files only in dleft are appended to l, files in both that differ
    (according to match() or by git mode) are appended to c, and files only
    in dright are appended to r. Lists passed in are extended in place, so a
    caller can accumulate over several comparisons; omitted lists start out
    empty for every call.

    Returns the tuple (l, c, r).
    """
    # Fresh lists per call: mutable defaults would be shared between calls.
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif (match(dleft[left], right)
              or gitmode(dleft.flags(left)) != gitmode(dright.flags(left))):
            # we have it but checksums or modes mismatch: add to center set
            c.append(left)
    for right in dright.keys():
        left = dleft.get(right, None)
        if left is None:
            # if parent has file but we don't: add to right set
            r.append(right)
        # change is already handled when comparing child against parent
    return l, c, r


def get_filechanges(repo, revision, parents, mleft):
    """Given some repository and revision, find all changed/deleted files.

    mleft is the manifest of the revision; it is compared against the
    manifest of every non-negative parent in parents. Returns three sorted
    lists: added, changed, removed.
    """
    l, c, r = [], [], []
    for p in parents:
        if p < 0:
            continue
        mright = repo.changectx(p).manifest()
        l, c, r = split_dict(mleft, mright, l, c, r)
    l.sort()
    c.sort()
    r.sort()
    return l, c, r


def get_author(logmessage, committer, authors):
    """Extract the author from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch, so
    try to find the author by detecting Signed-off-by lines.

    This walks from the end of the log message towards the top, skipping
    empty lines. From the first non-empty line it keeps walking upwards over
    consecutive Signed-off-by lines and uses the topmost one, passing its
    value through fixup_user() together with the authors table.

    If no Signed-off-by line is found, this defaults to the committer.

    Only the trailing block is considered because a log message may
    accidentally contain a line in the middle that starts with
    "Signed-off-by: foo" and would match the detection regex.
    """
    loglines = logmessage.split('\n')
    # from tail walk to top skipping empty lines
    i = len(loglines) - 1
    while i >= 0 and not loglines[i].strip():
        i -= 1
    # walk further upwards to find first sob line, store in 'first'
    first = None
    while i >= 0:
        m = sob_re.match(loglines[i])
        if m is None:
            break
        first = m
        i -= 1
    # if the last non-empty line matches our Signed-Off-by regex: extract username
    if first is not None:
        return fixup_user(first.group(1), authors)
    return committer


def export_file_contents(ctx, manifest, files, hgtags, encoding=''):
    """Emit an inline 'M' (filemodify) command for every file in files.

    The file data is read from ctx and the mode from manifest. '.hgtags' is
    skipped unless hgtags is true. If encoding is given, file names are
    re-encoded from that encoding to UTF-8. Progress is reported on stderr
    every cfg_export_boundary files.
    """
    count = 0
    total = len(files)
    for fname in files:
        # Skip .hgtags files. They only get us in trouble.
        if not hgtags and fname == ".hgtags":
            sys.stderr.write('Skip %s\n' % (fname))
            continue
        d = ctx.filectx(fname).data()
        if encoding:
            filename = fname.decode(encoding).encode('utf8')
        else:
            filename = fname
        wr('M %s inline %s' % (gitmode(manifest.flags(fname)),
                               strip_leading_slash(filename)))
        wr('data %d' % len(d))  # had some trouble with size()
        wr(d)
        count += 1
        if count % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (count, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (count, total))


def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1)

    Replaces '[', ' ', '~', '^', ':', '?', '\\', '*' and '..' by '_',
    replaces a trailing '/' or '.', replaces a leading '.' in every
    '/'-separated component and collapses runs of '_'. A warning naming
    `what` is written to stderr when the name had to be changed. Empty names
    and empty path components are passed through unchanged.
    """

    def dot(part):
        # Guard against empty components (e.g. 'a//b' or a leading '/').
        if part.startswith('.'):
            return '_' + part[1:]
        return part

    n = re.sub(r'([\[ ~^:?\\*]|\.\.)', '_', name)
    if n and n[-1] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join([dot(part) for part in n.split('/')])
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n


def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'."""
    if filename.startswith('/'):
        return filename[1:]
    return filename


def export_commit(ui, repo, revision, old_marks, max, count, authors,
                  branchesmap, sob, brmap, hgtags, notes, encoding='', fn_encoding=''):
    """Emit the fast-import commands for one hg revision.

    Writes the commit header (mark, optional author, committer, message),
    the parent references and the file deletions/modifications, followed by
    an optional note commit (see generate_note). brmap caches the sanitized
    branch names and is updated in place. `max` is only used for the progress
    message. Returns the updated command count.
    """
    def get_branchname(name):
        if name in brmap:
            return brmap[name]
        n = sanitize_name(branchesmap.get(name, name))
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors, encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # A parentless revision other than revision 0 starts a new root, so the
    # branch ref is reset before committing to it.
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # wr() appends a newline to the message, hence the +1 in the byte count.
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, delta_type = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        delta_type = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f = repo.status(repo.lookup(parents[0]), revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            delta_type = 'simple delta'
        else:  # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            delta_type = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, delta_type, revision + 1, max, len(added), len(changed), len(removed)))

    if fn_encoding:
        removed = [r.decode(fn_encoding).encode('utf8') for r in removed]

    removed = [strip_leading_slash(x) for x in removed]

    for r in removed:
        wr('D %s' % r)
    export_file_contents(ctx, man, added, hgtags, fn_encoding)
    export_file_contents(ctx, man, changed, hgtags, fn_encoding)
    wr()

    count = checkpoint(count)
    count = generate_note(user, time, timezone, revision, ctx, count, notes)
    return count


def generate_note(user, time, timezone, revision, ctx, count, notes):
    """Emit a refs/notes/hg commit attaching the hg hash to a revision's mark.

    Does nothing and returns count unchanged when notes is false; otherwise
    returns the count after checkpoint().
    """
    if not notes:
        return count
    wr('commit refs/notes/hg')
    wr('committer %s %d %s' % (user, time, timezone))
    wr('data 0')
    wr('N inline :%d' % (revision + 1))
    hg_hash = ctx.hex()
    wr('data %d' % (len(hg_hash)))
    wr_no_nl(hg_hash)
    wr()
    return checkpoint(count)


def export_tags(ui, repo, old_marks, mapping_cache, count, authors, tagsmap):
    """Emit a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tag names are remapped through tagsmap and sanitized. 'tip' and tags
    whose node is not in mapping_cache are skipped. Returns the updated
    command count.
    """
    for tag, tagnode in repo.tagslist():
        # Remap the tag name
        tag = sanitize_name(tagsmap.get(tag, tag), "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count


def load_mapping(name, filename):
    """Load a 'key = value' mapping file into a dict.

    Blank lines and lines starting with '#' are ignored; malformed lines are
    reported on stderr and skipped. A missing file yields an empty dict.
    `name` is only used in the summary message written to stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    loaded = 0
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # store the pair with surrounding whitespace removed
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache


def branchtip(repo, heads):
    '''return the tipmost branch head in heads'''
    tip = heads[-1]
    for h in reversed(heads):
        # NOTE(review): index 5 is presumably the changeset's 'extra' dict,
        # where a closed head carries a 'close' key -- confirm against the
        # Mercurial version in use.
        if 'close' not in repo.changelog.read(h)[5]:
            tip = h
            break
    return tip


def verify_heads(ui, repo, cache, force):
    """Check the git branch heads against the cache of the previous run.

    Reports on stderr every branch whose git SHA1 differs from the cached
    value and every hg branch that has more than one head. Returns False on
    the first such problem unless force is true; returns True otherwise.
    """
    branches = {}
    for bn, heads in repo.branchmap().iteritems():
        branches[bn] = branchtip(repo, heads)
    l = [(-repo.changelog.rev(n), n, t) for t, n in branches.items()]
    l.sort()

    # get list of hg's branches to verify, don't take all git has
    for _, _, b in l:
        b = get_branch(b)
        sha1 = get_git_sha1(b)
        c = cache.get(b)
        if sha1 != c:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
            if not force:
                return False

    # verify that branch has exactly one head
    t = {}
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if t.get(branch, False):
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        t[branch] = True

    return True


def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile,
           authors=None, branchesmap=None, tagsmap=None,
           sob=False, force=False, hgtags=False, notes=False, encoding='', fn_encoding=''):
    """Export the hg repository at repourl as a git-fast-import stream.

    Revisions from the 'tip' recorded in tipfile (0 if absent) up to m are
    exported; a negative m or one beyond the repository tip means "up to
    tip". The state and mapping caches are saved afterwards and tags are
    exported last.

    Returns 1 if verify_heads() fails, 0 otherwise.
    """
    # Fresh dicts per call: mutable defaults would be shared between calls.
    if authors is None:
        authors = {}
    if branchesmap is None:
        branchesmap = {}
    if tagsmap is None:
        tagsmap = {}

    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # Fall back to len(repo) when the changelog has no count() method.
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    first_rev = int(state_cache.get('tip', 0))
    last_rev = _max
    if _max < 0 or last_rev > tip:
        last_rev = tip

    for rev in range(0, last_rev):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    c = 0
    brmap = {}
    for rev in range(first_rev, last_rev):
        c = export_commit(ui, repo, rev, old_marks, last_rev, c, authors, branchesmap,
                          sob, brmap, hgtags, notes, encoding, fn_encoding)

    state_cache['tip'] = last_rev
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors, tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0


if __name__ == '__main__':
    def bail(parser, opt):
        """Report a missing mandatory option, print usage and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-B", "--branches", dest="branchesfile",
                      help="Read branch map from BRANCHESFILE")
    parser.add_option("-T", "--tags", dest="tagsfile",
                      help="Read tags map from TAGSFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash", action="store_true", dest="notes",
                      default=False, help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e", dest="encoding",
                      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
    parser.add_option("--fe", dest="fn_encoding",
                      help="Assume file names from Mercurial are encoded in <filename_encoding>")

    (options, args) = parser.parse_args()

    m = -1
    if options.max is not None:
        m = options.max

    if options.marksfile is None:
        bail(parser, '--marks')
    if options.mappingfile is None:
        bail(parser, '--mapping')
    if options.headsfile is None:
        bail(parser, '--heads')
    if options.statusfile is None:
        bail(parser, '--status')
    if options.repourl is None:
        bail(parser, '--repo')

    a = {}
    if options.authorfile is not None:
        a = load_mapping('authors', options.authorfile)

    b = {}
    if options.branchesfile is not None:
        b = load_mapping('branches', options.branchesfile)

    t = {}
    if options.tagsfile is not None:
        t = load_mapping('tags', options.tagsfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = ''
    if options.encoding is not None:
        encoding = options.encoding

    # File names default to the commit/author encoding unless --fe is given.
    fn_encoding = encoding
    if options.fn_encoding is not None:
        fn_encoding = options.fn_encoding

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a, branchesmap=b, tagsmap=t,
                    sob=options.sob, force=options.force, hgtags=options.hgtags,
                    notes=options.notes, encoding=encoding, fn_encoding=fn_encoding))