comparison extra/fast-export/hg-fast-export.py @ 1544:e9e55585ebf2 feature_1136

Add fast-export
author Chris Cannam <chris.cannam@soundsoftware.ac.uk>
date Tue, 12 Jan 2016 13:39:30 +0000
parents
children 3ad53f43483d
comparison
equal deleted inserted replaced
1542:60acfbd8f6d6 1544:e9e55585ebf2
1 #!/usr/bin/env python
2
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
5
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
13
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, so each LF (\n) written to it
    # would be turned into CRLF (\r\n), which makes git blow up. Switch the
    # stdout descriptor to binary mode with the platform-specific call.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
21
# Simple pattern for a Signed-off-by line in a log message; group 1 captures
# everything after the "Signed-off-by: " prefix.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# Emit a 'checkpoint' command after this many commits; 0 means never.
cfg_checkpoint_count = 0
# Print a progress message each time this many file contents were written.
cfg_export_boundary = 1000
28
def gitmode(flags):
    """Map a Mercurial file flag string to the git file mode string.

    'l' in flags yields '120000', otherwise 'x' yields '100755', otherwise
    the result is '100644'. The 'l' check wins when both flags are present.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
31
def wr_no_nl(msg=''):
    """Write msg to stdout without appending a newline.

    An empty (falsy) msg results in no write at all.
    """
    if not msg:
        return
    sys.stdout.write(msg)
35
def wr(msg=''):
    """Write msg (if any) to stdout, then terminate the line with '\\n'.

    Calling wr() without an argument emits just the newline.
    """
    wr_no_nl(msg)
    newline = '\n'
    sys.stdout.write(newline)
40
def checkpoint(count):
    """Advance the command counter and emit a 'checkpoint' when one is due.

    A checkpoint is due when cfg_checkpoint_count is positive and the
    incremented counter is a multiple of it. Returns the incremented counter.
    """
    issued = count + 1
    due = cfg_checkpoint_count > 0 and issued % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % issued)
        wr('checkpoint')
        wr()
    return issued
48
def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference.

    Returns the entry stored for rev in old_marks when there is a truthy
    one (an SHA1), otherwise the mark ':<rev+1>'.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
53
def file_mismatch(f1, f2):
    """Return True when two revisions of a file are not equal.

    The two file nodes are compared through their node.hex() form.
    """
    return node.hex(f1) != node.hex(f2)


def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Find the added, changed and missing files between two manifests.

    dleft  -- manifest of the revision being exported (needs keys(), get(),
              item access and flags())
    dright -- manifest of a parent revision (same interface)
    l      -- list receiving files present only in dleft
    c      -- list receiving files present in both whose content (per
              match) or git mode differs
    r      -- list receiving files present only in dright
    match  -- callable(left_value, right_value) returning True on mismatch

    Lists passed in are appended to in place, so a caller can accumulate
    results over several parents. An omitted list starts out fresh and empty
    on every call; the defaults used to be shared mutable lists that kept
    growing from call to call.

    Returns the tuple (l, c, r).
    """
    l = [] if l is None else l
    c = [] if c is None else c
    r = [] if r is None else r

    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif (match(dleft[left], right)
              or gitmode(dleft.flags(left)) != gitmode(dright.flags(left))):
            # both have it but content or mode mismatch: add to center set
            c.append(left)

    for right in dright.keys():
        # changes were already handled above when comparing child against
        # parent; only files the parent has and we lack remain
        if dleft.get(right, None) is None:
            r.append(right)

    return l, c, r
75
def get_filechanges(repo, revision, parents, mleft):
    """Given some repository and revision, find all changed/deleted files.

    mleft is the manifest of the revision itself. It is compared against
    the manifest of every non-negative parent revision in parents, and the
    results are accumulated over all parents.

    Returns three sorted lists: (added, changed, removed).
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
87
def get_author(logmessage, committer, authors):
    """Derive the git author of a commit from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch, so
    this tries to extract the author from Signed-off-by lines.

    Trailing empty lines of the log message are skipped. Starting at the
    last non-empty line, the consecutive run of Signed-off-by lines is
    walked upwards and the topmost line of that run is used: its name part
    is passed through fixup_user() together with the authors table.

    If the last non-empty line is not a Signed-off-by line, the committer
    is returned unchanged.

    Only the trailing run is considered because a log message may
    accidentally contain a line starting with "Signed-off-by: foo"
    somewhere in the middle, which must not be taken as authorship.
    """
    lines = logmessage.split('\n')

    # locate the last non-empty line; idx ends at -1 if there is none
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the contiguous Signed-off-by run, remembering the topmost
    topmost = None
    while idx >= 0:
        hit = sob_re.match(lines[idx])
        if hit is None:
            break
        topmost = hit
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
124
def export_file_contents(ctx, manifest, files, hgtags, encoding=''):
    """Write an inline 'M' (file modify) command with data for every file.

    ctx      -- change context providing filectx(name).data()
    manifest -- manifest providing flags(name), used for the git file mode
    files    -- names of the files to export
    hgtags   -- when false, a file named ".hgtags" is skipped
    encoding -- if non-empty, file names are decoded from this encoding and
                re-encoded as utf8 before being written

    Progress is reported on stderr every cfg_export_boundary files, plus
    once at the end when there were more files than that.
    """
    total = len(files)
    written = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue
        payload = ctx.filectx(path).data()
        git_path = path.decode(encoding).encode('utf8') if encoding else path
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(git_path)))
        # length is taken from the data itself (had some trouble with size())
        wr('data %d' % len(payload))
        wr(payload)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
147
def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    name -- the branch or tag name to clean up
    what -- label used in the warning message only

    Replaces each of '[', ' ', '~', '^', ':', '?', '\\', '*' and every '..'
    with '_', replaces a trailing '/' or '.' with '_', replaces a leading
    '.' of each '/'-separated component with '_', and finally collapses runs
    of '_' into one. A warning is written to stderr when the result differs
    from the input.

    Empty names and empty path components (as in "/foo" or "a//b") are
    passed through instead of raising IndexError.

    Returns the sanitized name.
    """

    def dot(component):
        # startswith() also copes with an empty component
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    n = re.sub(r'([\[ ~^:?\\*]|\.\.)', '_', name)
    if n.endswith(('/', '.')):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
166
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only a single leading slash is removed. An empty filename is returned
    unchanged instead of raising IndexError.
    """
    if filename.startswith('/'):
        return filename[1:]
    return filename
171
def export_commit(ui,repo,revision,old_marks,max,count,authors,
                  branchesmap,sob,brmap,hgtags,notes,encoding='',fn_encoding=''):
    """Write one hg revision to stdout as a git-fast-import 'commit'.

    ui, repo    -- passed through to get_changeset(); repo is also queried
                   for parents, change context and status
    revision    -- hg revision number to export; its mark is revision+1
    old_marks   -- mapping consulted by revnum_to_revref() for parent refs
    max         -- total number of revisions, used in the progress message
    count       -- running command counter (see checkpoint())
    authors     -- author mapping handed to get_changeset()/get_author()
    branchesmap -- optional renames applied to branch names before sanitizing
    sob         -- when true, emit an 'author' line derived from
                   Signed-off-by lines in the description
    brmap       -- cache of original branch name -> sanitized branch name,
                   updated in place
    hgtags      -- forwarded to export_file_contents()
    notes       -- forwarded to generate_note()
    encoding    -- forwarded to get_changeset()
    fn_encoding -- if non-empty, encoding of file names; removed names are
                   re-encoded as utf8 and it is forwarded to
                   export_file_contents()

    Returns the updated command counter.
    """
    def get_branchname(name):
        # reuse the cached result so each branch is sanitized only once
        if name in brmap:
            return brmap[name]
        n=sanitize_name(branchesmap.get(name,name))
        brmap[name]=n
        return n

    (revnode,_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding)

    branch=get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0 starts a new root on this branch
    if len(parents)==0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # +1 accounts for the newline wr() appends after the description
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    ctx=repo.changectx(str(revision))
    man=ctx.manifest()
    added,changed,removed,delta_kind=[],[],[],''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added=sorted(man.keys())
        delta_kind='full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f=repo.status(repo.lookup(parents[0]),revnode)[:3]
            added,changed,removed=f[1],f[0],f[2]
            delta_kind='simple delta'
        else: # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added,changed,removed=get_filechanges(repo,revision,parents,man)
            delta_kind='thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch,delta_kind,revision+1,max,len(added),len(changed),len(removed)))

    if fn_encoding:
        removed=[r.decode(fn_encoding).encode('utf8') for r in removed]

    removed=[strip_leading_slash(x) for x in removed]

    # explicit loop: a side-effect-only map() would never run where map() is lazy
    for r in removed:
        wr('D %s' % r)
    export_file_contents(ctx,man,added,hgtags,fn_encoding)
    export_file_contents(ctx,man,changed,hgtags,fn_encoding)
    wr()

    count=checkpoint(count)
    count=generate_note(user,time,timezone,revision,ctx,count,notes)
    return count
241
def generate_note(user, time, timezone, revision, ctx, count, notes):
    """Optionally attach the hg changeset hash to a commit as a git note.

    When notes is false nothing is written and count is returned as is.
    Otherwise a commit on refs/notes/hg is written whose note for mark
    ':<revision+1>' contains ctx.hex(), and the counter is passed through
    checkpoint().

    Returns the (possibly advanced) command counter.
    """
    if notes:
        wr('commit refs/notes/hg')
        wr('committer %s %d %s' % (user, time, timezone))
        wr('data 0')
        wr('N inline :%d' % (revision + 1))
        note_body = ctx.hex()
        wr('data %d' % (len(note_body)))
        # the note data itself is written without a trailing newline
        wr_no_nl(note_body)
        wr()
        count = checkpoint(count)
    return count
254
def export_tags(ui, repo, old_marks, mapping_cache, count, authors, tagsmap):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tags are taken from repo.tagslist(). Each name is remapped through
    tagsmap and sanitized; 'tip' is skipped, and so are tags whose node
    is not (yet) present in mapping_cache.

    Returns the updated command counter.
    """
    for tag, tagnode in repo.tagslist():
        # remap and sanitize the tag name
        tag = sanitize_name(tagsmap.get(tag, tag), "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])
        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue

        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
280
def load_mapping(name, filename):
    """Load a 'key = value' mapping file into a dict.

    name     -- label used only in the 'Loaded ...' message on stderr
    filename -- path of the mapping file; a missing file gives an empty dict

    Blank lines and lines starting with '#' are ignored. A line without a
    '=' separator is reported on stderr with its line number and skipped.
    Keys and values are stripped of surrounding whitespace; the key is
    everything before the first '='.

    Returns the resulting dict.
    """
    cache={}
    if not os.path.exists(filename):
        return cache
    lre=re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    loaded=0
    # the with-block closes the file even if parsing raises
    with open(filename,'r') as f:
        for lineno,line in enumerate(f,1):
            line=line.strip()
            if line=='' or line[0]=='#':
                continue
            m=lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
                continue
            cache[m.group(1).strip()]=m.group(2).strip()
            loaded+=1
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
304
def branchtip(repo, heads):
    '''return the tipmost branch head in heads

    Walking from the last head backwards, the first head whose changelog
    entry has no 'close' key in element 5 is returned. If every head has
    one, the last head is returned.
    '''
    fallback = heads[-1]
    for candidate in reversed(heads):
        if 'close' not in repo.changelog.read(candidate)[5]:
            return candidate
    return fallback
313
def verify_heads(ui, repo, cache, force):
    """Check that the git heads still match what the last run recorded.

    For the tip of every hg branch, the current git SHA1 of the
    corresponding git branch is compared with the entry in cache. Then the
    repository heads are checked so that no branch has more than one head.

    Every problem is reported on stderr. Returns False at the first problem
    unless force is true; returns True otherwise.
    """
    tips = {}
    for bn, heads in repo.branchmap().iteritems():
        tips[bn] = branchtip(repo, heads)
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in tips.items())

    # get list of hg's branches to verify, don't take all git has
    for entry in ordered:
        b = get_branch(entry[2])
        sha1 = get_git_sha1(b)
        c = cache.get(b)
        if sha1 != c:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = {}
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen[branch] = True

    return True
342
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,
           authors=None,branchesmap=None,tagsmap=None,
           sob=False,force=False,hgtags=False,notes=False,encoding='',fn_encoding=''):
    """Export an hg repository to stdout as a git-fast-import stream.

    repourl     -- repository location handed to setup_repo()
    m           -- maximum hg revision to import; negative means up to tip
    marksfile   -- file with git-fast-import marks from earlier runs
    mappingfile -- file caching the hg node -> revision number mapping
    headsfile   -- file with the git heads recorded by the last run
    tipfile     -- state file; its 'tip' entry is the first revision to
                   export, and it is rewritten after the export
    authors, branchesmap, tagsmap -- optional mapping dicts; each defaults
                   to a fresh empty dict
    sob, force, hgtags, notes, encoding, fn_encoding -- forwarded to
                   verify_heads()/export_commit()

    Returns 1 when head verification fails, 0 on success.
    """
    # None instead of {} as default: a dict default would be shared by all calls
    if authors is None:
        authors={}
    if branchesmap is None:
        branchesmap={}
    if tagsmap is None:
        tagsmap={}

    limit=int(m)

    # stored marks are one-based, revision numbers zero-based
    old_marks=load_cache(marksfile,lambda s: int(s)-1)
    mapping_cache=load_cache(mappingfile)
    heads_cache=load_cache(headsfile)
    state_cache=load_cache(tipfile)

    ui,repo=setup_repo(repourl)

    if not verify_heads(ui,repo,heads_cache,force):
        return 1

    # fall back to len(repo) where the changelog has no count() method
    try:
        tip=repo.changelog.count()
    except AttributeError:
        tip=len(repo)

    start=int(state_cache.get('tip',0))
    if limit<0 or limit>tip:
        limit=tip

    # record the node -> revision number mapping for everything up to limit
    for rev in range(0,limit):
        (revnode,_,_,_,_,_,_,_)=get_changeset(ui,repo,rev,authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    c=0
    brmap={}
    for rev in range(start,limit):
        c=export_commit(ui,repo,rev,old_marks,limit,c,authors,branchesmap,
                        sob,brmap,hgtags,notes,encoding,fn_encoding)

    state_cache['tip']=limit
    state_cache['repo']=repourl
    save_cache(tipfile,state_cache)
    save_cache(mappingfile,mapping_cache)

    c=export_tags(ui,repo,old_marks,mapping_cache,c,authors,tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
389
if __name__=='__main__':
    def bail(parser,opt):
        """Report a missing mandatory option, print the help and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser=OptionParser()

    parser.add_option("-m","--max",type="int",dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping",dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks",dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads",dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status",dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r","--repo",dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s",action="store_true",dest="sob",
                      default=False,help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags",action="store_true",dest="hgtags",
                      default=False,help="Enable exporting .hgtags files")
    parser.add_option("-A","--authors",dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-B","--branches",dest="branchesfile",
                      help="Read branch map from BRANCHESFILE")
    parser.add_option("-T","--tags",dest="tagsfile",
                      help="Read tags map from TAGSFILE")
    parser.add_option("-f","--force",action="store_true",dest="force",
                      default=False,help="Ignore validation errors by force")
    parser.add_option("-M","--default-branch",dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o","--origin",dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash",action="store_true",dest="notes",
                      default=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e",dest="encoding",
                      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
    parser.add_option("--fe",dest="fn_encoding",
                      help="Assume file names from Mercurial are encoded in <filename_encoding>")

    (options,args)=parser.parse_args()

    # -1 means no upper limit on the revisions to import
    m = -1 if options.max is None else options.max

    # mandatory options, checked in this order
    required = (
        ('marksfile', '--marks'),
        ('mappingfile', '--mapping'),
        ('headsfile', '--heads'),
        ('statusfile', '--status'),
        ('repourl', '--repo'),
    )
    for attr, flag in required:
        if getattr(options, attr) is None:
            bail(parser, flag)

    # optional mapping files; each mapping stays empty when no file is given
    a = {}
    if options.authorfile is not None:
        a = load_mapping('authors', options.authorfile)

    b = {}
    if options.branchesfile is not None:
        b = load_mapping('branches', options.branchesfile)

    t = {}
    if options.tagsfile is not None:
        t = load_mapping('tags', options.tagsfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = '' if options.encoding is None else options.encoding

    # file names use the commit encoding unless --fe overrides it
    fn_encoding = encoding if options.fn_encoding is None else options.fn_encoding

    status = hg2git(options.repourl,m,options.marksfile,options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a,branchesmap=b,tagsmap=t,
                    sob=options.sob,force=options.force,hgtags=options.hgtags,
                    notes=options.notes,encoding=encoding,fn_encoding=fn_encoding)
    sys.exit(status)