#!/usr/bin/env python

# Copyright (c) 2007, 2008 Rocco Rutte and others.
# License: MIT

from mercurial import node
from hg2git import setup_repo,fixup_user,get_branch,get_changeset
from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
from optparse import OptionParser
import re
import sys
import os

if sys.platform == "win32":
    # On Windows, sys.stdout is initially opened in text mode, which means that
    # when a LF (\n) character is written to sys.stdout, it will be converted
    # into CRLF (\r\n). That makes git blow up, so use this platform-specific
    # code to change the mode of sys.stdout to binary.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

# silly regex to catch Signed-off-by lines in log message
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# insert 'checkpoint' command after this many commits or none at all if 0
cfg_checkpoint_count = 0
# write some progress message every this many file contents written
cfg_export_boundary = 1000


def gitmode(flags):
    """Return the git file mode string for a set of hg manifest flags.

    'l' (symlink) wins over 'x' (executable); anything else is a regular
    file.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'


def wr_no_nl(msg=''):
    """Write msg to stdout without appending a newline."""
    if msg:
        sys.stdout.write(msg)


def wr(msg=''):
    """Write msg to stdout followed by a single newline."""
    wr_no_nl(msg)
    sys.stdout.write('\n')
    #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))


def checkpoint(count):
    """Bump the command counter and emit a 'checkpoint' command if due.

    A checkpoint is written every cfg_checkpoint_count commands; nothing is
    written when cfg_checkpoint_count is 0. Returns the incremented count.
    """
    count = count + 1
    if cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count


def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference (an SHA1
    or a mark)"""
    return old_marks.get(rev) or ':%d' % (rev + 1)


def file_mismatch(f1, f2):
    """See if two revisions of a file are not equal."""
    return node.hex(f1) != node.hex(f2)


def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Loop over our repository and find all changed and missing files.

    dleft is our manifest, dright the parent's. Names only in dleft are
    appended to l, names in both whose content or git mode differ are
    appended to c, and names only in dright are appended to r. Lists passed
    in are extended in place so results can be accumulated over several
    parents; omitted lists start out empty on every call.

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif match(dleft[left], right) or gitmode(dleft.flags(left)) != gitmode(dright.flags(left)):
            # we have it but checksums mismatch: add to center set
            c.append(left)
    for right in dright.keys():
        left = dleft.get(right, None)
        if left is None:
            # if parent has file but we don't: add to right set
            r.append(right)
        # change is already handled when comparing child against parent
    return l, c, r


def get_filechanges(repo, revision, parents, mleft):
    """Given some repository and revision, find all changed/deleted files.

    mleft is the manifest of the revision being exported; it is compared
    against the manifest of every non-negative parent revision. Returns
    sorted (added, changed, removed) lists.
    """
    l, c, r = [], [], []
    for p in parents:
        if p < 0:
            continue
        mright = repo.changectx(p).manifest()
        l, c, r = split_dict(mleft, mright, l, c, r)
    l.sort()
    c.sort()
    r.sort()
    return l, c, r


def get_author(logmessage, committer, authors):
    """As git distinguishes between author and committer of a patch, try to
    extract author by detecting Signed-off-by lines.

    This walks from the end of the log message towards the top skipping
    empty lines. Upon the first non-empty line, it walks all Signed-off-by
    lines upwards to find the first one. For that (if found), it extracts
    authorship information the usual way (authors table, cleaning, etc.)

    If no Signed-off-by line is found, this defaults to the committer.

    This may sound stupid (and it somehow is), but in log messages we
    accidentally may have lines in the middle starting with
    "Signed-off-by: foo" and thus matching our detection regex. Prevent
    that."""

    loglines = logmessage.split('\n')
    # from tail walk to top skipping empty lines
    i = len(loglines) - 1
    while i >= 0 and len(loglines[i].strip()) == 0:
        i -= 1
    # walk further upwards to find first sob line, store in 'first'
    first = None
    while i >= 0:
        m = sob_re.match(loglines[i])
        if m is None:
            break
        first = m
        i -= 1
    # if the last non-empty line matches our Signed-Off-by regex: extract username
    if first is not None:
        return fixup_user(first.group(1), authors)
    return committer


def export_file_contents(ctx, manifest, files, hgtags, encoding=''):
    """Emit an inline 'M' (filemodify) command with data for each file.

    File names are re-encoded from `encoding` to UTF-8 when an encoding is
    given. '.hgtags' is skipped unless hgtags is true. Progress is reported
    on stderr every cfg_export_boundary files.
    """
    count = 0
    total = len(files)
    for name in files:
        # Skip .hgtags files. They only get us in trouble.
        if not hgtags and name == ".hgtags":
            sys.stderr.write('Skip %s\n' % (name))
            continue
        d = ctx.filectx(name).data()
        if encoding:
            filename = name.decode(encoding).encode('utf8')
        else:
            filename = name
        wr('M %s inline %s' % (gitmode(manifest.flags(name)),
                               strip_leading_slash(filename)))
        wr('data %d' % len(d))  # had some trouble with size()
        wr(d)
        count += 1
        if count % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (count, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (count, total))


def sanitize_name(name, what="branch", mapping=None):
    """Sanitize input roughly according to git-check-ref-format(1)

    name is first looked up in mapping (falling back to name itself), then
    characters git refuses in ref names are replaced by '_'. A warning is
    written to stderr whenever the result differs from name; `what` is only
    used in that warning. Returns the sanitized name.
    """

    # NOTE: Do not update this transform to work around
    # incompatibilities on your platform. If you change it and it starts
    # modifying names which previously were not touched it will break
    # preexisting setups which are doing incremental imports.
    #
    # Use the -B and -T options to mangle branch and tag names
    # instead. If you have a source repository where this is too much
    # work to do manually, write a tool that does it for you.

    def dot(component):
        # a path component must not start with a dot
        if component[0] == '.':
            return '_' + component[1:]
        return component

    if mapping is None:
        mapping = {}

    n = mapping.get(name, name)
    p = re.compile(r'([[ ~^:?\\*]|\.\.)')
    n = p.sub('_', n)
    if n[-1] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    # collapse runs of underscores produced by the substitutions above
    p = re.compile('_+')
    n = p.sub('_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n


def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'."""
    if filename[:1] == '/':
        return filename[1:]
    return filename


def export_commit(ui, repo, revision, old_marks, max, count, authors,
                  branchesmap, sob, brmap, hgtags, encoding='', fn_encoding=''):
    """Write one hg revision to stdout as a fast-import 'commit' command.

    brmap caches the sanitized git name per hg branch name and is updated in
    place. `max` is only used for the progress message. Returns the command
    counter as updated by checkpoint().
    """
    def get_branchname(name):
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name, "branch", branchesmap)
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), _, desc, branch, _) = get_changeset(ui, repo, revision, authors, encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0 starts an unrelated history
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1: the newline that wr(desc) appends is counted as part of the message
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            changed, added, removed = repo.status(repo.lookup(parents[0]), revnode)[:3]
            kind = 'simple delta'
        else:  # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    if fn_encoding:
        removed = [r.decode(fn_encoding).encode('utf8') for r in removed]

    removed = [strip_leading_slash(x) for x in removed]

    for r in removed:
        wr('D %s' % r)
    export_file_contents(ctx, man, added, hgtags, fn_encoding)
    export_file_contents(ctx, man, changed, hgtags, fn_encoding)
    wr()

    return checkpoint(count)


def export_note(ui, repo, revision, count, authors, encoding, is_first):
    """Attach the hg changeset hash to a revision's mark as a note.

    Writes a commit on refs/notes/hg with an empty message whose only
    content is an inline note on mark :(revision+1) holding the hex hg
    hash. When is_first is true the commit is based on the existing
    refs/notes/hg. Returns the command counter as updated by checkpoint().
    """
    (_, _, user, (time, timezone), _, _, _, _) = get_changeset(ui, repo, revision, authors, encoding)

    wr('commit refs/notes/hg')
    wr('committer %s %d %s' % (user, time, timezone))
    wr('data 0')
    if is_first:
        wr('from refs/notes/hg^0')
    wr('N inline :%d' % (revision + 1))
    hg_hash = repo.changectx(str(revision)).hex()
    wr('data %d' % (len(hg_hash)))
    wr_no_nl(hg_hash)
    wr()
    return checkpoint(count)


def export_tags(ui, repo, old_marks, mapping_cache, count, authors, tagsmap):
    """Emit a 'reset refs/tags/...' command for every exportable hg tag.

    'tip' and tags whose node is not present in mapping_cache are skipped.
    Returns the command counter after all tags have been written.
    """
    l = repo.tagslist()
    for tag, tagnode in l:
        # Remap the branch name
        tag = sanitize_name(tag, "tag", tagsmap)
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count


def load_mapping(name, filename):
    """Read a 'key = value' mapping file into a dict.

    Blank lines and lines starting with '#' are ignored; malformed lines are
    reported on stderr and skipped. A missing file yields an empty dict.
    `name` is only used in the summary message written to stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        sys.stderr.write('Could not open mapping file [%s]\n' % (filename))
        return cache
    lineno = 0
    loaded = 0
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache, key without ^:
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    finally:
        # close the handle even if reading fails part-way through
        f.close()
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache


def branchtip(repo, heads):
    '''return the tipmost branch head in heads'''
    tip = heads[-1]
    for h in reversed(heads):
        # index 5 of the changelog entry is checked for the 'close' marker;
        # the last head is kept as fallback when every head carries it
        if 'close' not in repo.changelog.read(h)[5]:
            tip = h
            break
    return tip


def verify_heads(ui, repo, cache, force, branchesmap):
    """Check the git side against the heads cache and hg's head layout.

    Reports on stderr every branch whose current git SHA1 differs from the
    cached one and every second head found on the same hg branch. Returns
    False on the first problem unless force is set; True otherwise.
    """
    branches = {}
    for bn, heads in repo.branchmap().iteritems():
        branches[bn] = branchtip(repo, heads)
    l = [(-repo.changelog.rev(n), n, t) for t, n in branches.items()]
    l.sort()

    # get list of hg's branches to verify, don't take all git has
    for _, _, b in l:
        b = get_branch(b)
        sanitized_name = sanitize_name(b, "branch", branchesmap)
        sha1 = get_git_sha1(sanitized_name)
        c = cache.get(sanitized_name)
        if sha1 != c:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
            if not force:
                return False

    # verify that branch has exactly one head
    t = {}
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if t.get(branch, False):
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        t[branch] = True

    return True


def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile,
           authors=None, branchesmap=None, tagsmap=None,
           sob=False, force=False, hgtags=False, notes=False, encoding='', fn_encoding=''):
    """Export revisions of the hg repository at repourl to stdout.

    Revisions from the 'tip' value stored in tipfile (0 if absent) up to m
    (or the repository tip when m is negative or too large) are written as
    commits, optionally followed by notes carrying the hg hashes, and
    finally tags. The tip and mapping caches are saved afterwards.

    Returns 1 if head verification fails, 0 otherwise.
    """
    def check_cache(filename, contents):
        if len(contents) == 0:
            sys.stderr.write('Warning: %s does not contain any data, this will probably make an incremental import fail\n' % filename)

    if authors is None:
        authors = {}
    if branchesmap is None:
        branchesmap = {}
    if tagsmap is None:
        tagsmap = {}

    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    # a non-empty state cache means this is an incremental run, so the
    # other caches are expected to hold data as well
    if len(state_cache) != 0:
        for (name, data) in [(marksfile, old_marks),
                             (mappingfile, mapping_cache),
                             (headsfile, heads_cache)]:
            check_cache(name, data)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force, branchesmap):
        return 1

    try:
        tip = repo.changelog.count()
    except AttributeError:
        # changelog has no count(); fall back to len(repo)
        tip = len(repo)

    first_rev = int(state_cache.get('tip', 0))
    last_rev = _max
    if _max < 0 or last_rev > tip:
        last_rev = tip

    for rev in range(0, last_rev):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    c = 0
    brmap = {}
    for rev in range(first_rev, last_rev):
        c = export_commit(ui, repo, rev, old_marks, last_rev, c, authors, branchesmap,
                          sob, brmap, hgtags, encoding, fn_encoding)
    if notes:
        for rev in range(first_rev, last_rev):
            # only the first note of an incremental run continues from the
            # existing notes ref
            c = export_note(ui, repo, rev, c, authors, encoding,
                            rev == first_rev and first_rev != 0)

    state_cache['tip'] = last_rev
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors, tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0


if __name__ == '__main__':
    def bail(parser, opt):
        """Report a missing mandatory option, print usage and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-B", "--branches", dest="branchesfile",
                      help="Read branch map from BRANCHESFILE")
    parser.add_option("-T", "--tags", dest="tagsfile",
                      help="Read tags map from TAGSFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use ORIGIN_NAME as namespace to track upstream")
    parser.add_option("--hg-hash", action="store_true", dest="notes",
                      default=False, help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e", dest="encoding",
                      help="Assume commit and author strings retrieved from Mercurial are encoded in ENCODING")
    parser.add_option("--fe", dest="fn_encoding",
                      help="Assume file names from Mercurial are encoded in FN_ENCODING")

    (options, args) = parser.parse_args()

    m = -1
    if options.max is not None:
        m = options.max

    if options.marksfile is None:
        bail(parser, '--marks')
    if options.mappingfile is None:
        bail(parser, '--mapping')
    if options.headsfile is None:
        bail(parser, '--heads')
    if options.statusfile is None:
        bail(parser, '--status')
    if options.repourl is None:
        bail(parser, '--repo')

    a = {}
    if options.authorfile is not None:
        a = load_mapping('authors', options.authorfile)

    b = {}
    if options.branchesfile is not None:
        b = load_mapping('branches', options.branchesfile)

    t = {}
    if options.tagsfile is not None:
        t = load_mapping('tags', options.tagsfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = ''
    if options.encoding is not None:
        encoding = options.encoding

    # file names default to the commit/author encoding unless --fe is given
    fn_encoding = encoding
    if options.fn_encoding is not None:
        fn_encoding = options.fn_encoding

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a, branchesmap=b, tagsmap=t,
                    sob=options.sob, force=options.force, hgtags=options.hgtags,
                    notes=options.notes, encoding=encoding, fn_encoding=fn_encoding))