Mercurial > hg > soundsoftware-site
comparison extra/fast-export/hg-fast-export.py @ 1544:e9e55585ebf2 feature_1136
Add fast-export
author | Chris Cannam <chris.cannam@soundsoftware.ac.uk> |
---|---|
date | Tue, 12 Jan 2016 13:39:30 +0000 |
parents | |
children | 3ad53f43483d |
comparison
equal
deleted
inserted
replaced
1542:60acfbd8f6d6 | 1544:e9e55585ebf2 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others. | |
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php> | |
5 | |
6 from mercurial import node | |
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset | |
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name | |
9 from optparse import OptionParser | |
10 import re | |
11 import sys | |
12 import os | |
13 | |
if sys.platform == "win32":
    # Text-mode stdout on Windows rewrites every LF (\n) as CRLF (\r\n),
    # which makes git blow up on the stream we emit. Switch the stdout
    # file descriptor to binary mode so bytes are written unchanged.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
21 | |
# Deliberately simple pattern for a "Signed-off-by: <who>" line in a log
# message; group 1 captures everything after the colon and space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command after this many commits; 0 means never.
cfg_checkpoint_count = 0

# Write a progress message to stderr every this many file contents written.
cfg_export_boundary = 1000
28 | |
def gitmode(flags):
    """Map a flags value to a git file-mode string.

    'l' in flags wins and yields '120000'; otherwise 'x' yields '100755';
    anything else yields '100644'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
31 | |
def wr_no_nl(msg=''):
    """Write msg to stdout without a trailing newline; do nothing if msg is empty."""
    if not msg:
        return
    sys.stdout.write(msg)
35 | |
def wr(msg=''):
    """Write msg (if non-empty) to stdout, followed by a single newline."""
    wr_no_nl(msg)
    # '\n' is non-empty, so wr_no_nl always writes it.
    wr_no_nl('\n')
    #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
40 | |
def checkpoint(count):
    """Bump the command counter and emit a 'checkpoint' command when due.

    A checkpoint is written only when cfg_checkpoint_count is positive and
    the incremented count is a multiple of it. Returns the incremented count.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
48 | |
def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference.

    Returns the entry stored for rev in old_marks when there is a truthy
    one, otherwise the mark ':<rev+1>'.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
53 | |
def file_mismatch(f1,f2):
    """Return True when two file revisions have different hex node ids."""
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
57 | |
def split_dict(dleft,dright,l=None,c=None,r=None,match=None):
    """Compare two manifests and sort their file names into three buckets.

    dleft  -- our manifest (mapping of file name to node, with a flags() method)
    dright -- the parent's manifest (same shape)
    l      -- list extended with names only present in dleft
    c      -- list extended with names present in both but differing
    r      -- list extended with names only present in dright
    match  -- callable(left_node, right_node) returning True on a mismatch;
              defaults to file_mismatch

    The lists passed in are appended to in place, so results can be
    accumulated over several parents. When a list is omitted a fresh one is
    created for this call; the previous literal ``[]`` defaults were shared
    between calls and silently accumulated stale entries.

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    if match is None:
        # resolved at call time so the default does not depend on definition order
        match = file_mismatch

    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif match(dleft[left], right) or gitmode(dleft.flags(left)) != gitmode(dright.flags(left)):
            # we have it but checksums (or git modes) mismatch: add to center set
            c.append(left)

    for right in dright.keys():
        if dleft.get(right, None) is None:
            # parent has the file but we don't: add to right set
            r.append(right)
        # change is already handled when comparing child against parent

    return l, c, r
75 | |
def get_filechanges(repo,revision,parents,mleft):
    """Collect added/changed/removed file names of mleft versus each parent.

    Negative parent revnums are skipped. For every remaining parent its
    manifest is fetched from repo and fed to split_dict, accumulating into
    the same three lists. Returns the three lists, each sorted.
    (revision is accepted but not used here.)
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(mleft, parent_manifest,
                                                 added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
87 | |
def get_author(logmessage,committer,authors):
    """Derive the author of a change from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch, so
    try to extract the author by detecting Signed-off-by lines.

    Trailing empty lines of the log message are ignored. Starting at the
    last non-empty line, the contiguous run of Signed-off-by lines is
    walked upwards and the topmost one of that run is used: its name is
    passed through fixup_user together with the authors table.

    If the last non-empty line is not a Signed-off-by line (or the message
    is entirely empty), the committer is returned instead.

    Only the trailing run is considered on purpose: a log message may
    accidentally contain a line starting with "Signed-off-by: foo" somewhere
    in the middle, and that must not be picked up.
    """
    lines = logmessage.split('\n')

    # find the end of the message, ignoring trailing blank lines
    end = len(lines)
    while end > 0 and not lines[end - 1].strip():
        end -= 1

    # walk the trailing block upwards while lines keep matching
    topmost = None
    for line in reversed(lines[:end]):
        hit = sob_re.match(line)
        if hit is None:
            break
        topmost = hit

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
124 | |
def export_file_contents(ctx,manifest,files,hgtags,encoding=''):
    """Write an inline 'M' (file modify) command plus data for each file.

    ctx      -- change context providing filectx(name).data()
    manifest -- manifest providing flags(name), used to pick the git mode
    files    -- iterable of file names to export (len() is taken of it)
    hgtags   -- when false, a file named ".hgtags" is skipped
    encoding -- if non-empty, file names are decoded from it and re-encoded
                as utf8 before being written

    Progress is reported on stderr every cfg_export_boundary files, plus a
    final summary when more than cfg_export_boundary files were requested.
    """
    total = len(files)
    exported = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue
        data = ctx.filectx(path).data()
        filename = path.decode(encoding).encode('utf8') if encoding else path
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(filename)))
        wr('data %d' % len(data)) # had some trouble with size()
        wr(data)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported,total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported,total))
147 | |
def sanitize_name(name,what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    name -- the ref name to clean up
    what -- label used in the warning message (e.g. "branch" or "tag")

    Applied in order:
      * each of ``[``, space, ``~``, ``^``, ``:``, ``?``, backslash, ``*``
        and every ``..`` sequence becomes ``_``
      * a trailing ``/`` or ``.`` becomes ``_``
      * a leading ``.`` of any ``/``-separated component becomes ``_``
      * runs of ``_`` collapse into a single ``_``

    Empty names and empty path components (from a leading or doubled ``/``)
    are passed through unchanged instead of raising IndexError.

    Writes a warning to stderr when the result differs from the input and
    returns the sanitized name.
    """

    def dot(component):
        # slice instead of index: a component may be the empty string
        if component[:1] == '.':
            return '_' + component[1:]
        return component

    # raw string with '[' escaped inside the class; same set of characters
    # as before, without invalid-escape / nested-set warnings
    n = re.sub(r'([\[ ~^:?\\*]|\.\.)', '_', name)
    # n[-1:] is '' for an empty name, which simply does not match
    if n[-1:] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
    return n
166 | |
def strip_leading_slash(filename):
    """Return filename without a single leading '/', if it has one.

    Only the first slash is removed. An empty filename is returned
    unchanged (indexing filename[0] used to raise IndexError for it).
    """
    # slicing is safe on the empty string, unlike filename[0]
    if filename[:1] == '/':
        return filename[1:]
    return filename
171 | |
def export_commit(ui,repo,revision,old_marks,max,count,authors,
                  branchesmap,sob,brmap,hgtags,notes,encoding='',fn_encoding=''):
    """Write one hg revision as a 'commit' command with its file changes.

    ui, repo    -- passed through to get_changeset / used to query the repo
    revision    -- hg revision number to export; its mark is revision+1
    old_marks   -- mapping used by revnum_to_revref to reference parents
    max         -- total revision count, only used in the progress message
    count       -- running command counter (see checkpoint)
    authors     -- author mapping handed to get_changeset / get_author
    branchesmap -- user-supplied branch rename mapping
    sob         -- when true, emit an 'author' line derived from
                   Signed-off-by lines
    brmap       -- cache of already sanitized branch names; updated in place
    hgtags      -- forwarded to export_file_contents (.hgtags handling)
    notes       -- forwarded to generate_note
    encoding    -- forwarded to get_changeset
    fn_encoding -- if non-empty, file names are re-encoded from it to utf8

    Returns the updated command counter.
    """
    def get_branchname(name):
        # sanitize each hg branch name once and remember the result
        if name in brmap:
            return brmap[name]
        n = sanitize_name(branchesmap.get(name, name))
        brmap[name] = n
        return n

    (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0 starts a new root: reset the ref first
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # +1 covers the newline that wr() appends after desc
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f = repo.status(repo.lookup(parents[0]),revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            kind = 'simple delta'
        else: # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo,revision,parents,man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch,kind,revision+1,max,len(added),len(changed),len(removed)))

    if fn_encoding:
        removed = [r.decode(fn_encoding).encode('utf8') for r in removed]

    removed = [strip_leading_slash(x) for x in removed]

    # explicit loop: map() for side effects does nothing where map is lazy
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx,man,added,hgtags,fn_encoding)
    export_file_contents(ctx,man,changed,hgtags,fn_encoding)
    wr()

    count = checkpoint(count)
    count = generate_note(user,time,timezone,revision,ctx,count,notes)
    return count
241 | |
def generate_note(user,time,timezone,revision,ctx,count,notes):
    """Optionally attach the hg hash of a revision as a note on its commit.

    When notes is false nothing is written and count is returned unchanged.
    Otherwise a commit on refs/notes/hg is written whose inline note for
    mark :<revision+1> contains ctx.hex(), and the counter returned by
    checkpoint(count) is returned.
    """
    if notes:
        wr('commit refs/notes/hg')
        wr('committer %s %d %s' % (user,time,timezone))
        wr('data 0')
        wr('N inline :%d' % (revision+1))
        hg_hash = ctx.hex()
        wr('data %d' % (len(hg_hash)))
        # the hash is written without its own newline; wr() then adds one
        wr_no_nl(hg_hash)
        wr()
        return checkpoint(count)
    return count
254 | |
def export_tags(ui,repo,old_marks,mapping_cache,count,authors,tagsmap):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tags come from repo.tagslist(). Each name is remapped through tagsmap
    and sanitized; 'tip' is skipped, as are tags whose hex-encoded node is
    not a key of mapping_cache. Returns the updated command counter.
    """
    for raw_tag, tag_node in repo.tagslist():
        # Remap the tag name
        tag = sanitize_name(tagsmap.get(raw_tag, raw_tag), "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hex_node = tag_node.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hex_node not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hex_node))
            continue

        rev = int(mapping_cache[hex_node])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag,rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
280 | |
def load_mapping(name, filename):
    """Load a 'key = value' mapping file into a dict.

    name     -- label used only in the summary message on stderr
    filename -- path of the file to read

    Empty lines and lines starting with '#' are ignored. Every other line
    must contain '='; the text before the first '=' is the key and the rest
    is the value, both stripped of surrounding whitespace. Malformed lines
    are reported on stderr with their line number and skipped.

    Returns the mapping; an empty dict (and no message) if the file does
    not exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    entry_re = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    loaded = 0
    # 'with' guarantees the handle is closed even if reading fails midway
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = entry_re.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
                continue
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
304 | |
def branchtip(repo, heads):
    '''return the tipmost branch head in heads

    Heads are scanned from last to first and the first one whose changelog
    entry has no 'close' in its element 5 is returned; if every head has
    it, the last head is returned.'''
    for head in reversed(heads):
        if 'close' not in repo.changelog.read(head)[5]:
            return head
    return heads[-1]
313 | |
def verify_heads(ui,repo,cache,force):
    """Sanity-check the repository heads before exporting.

    First, for every hg branch (taking its tip from branchtip) the SHA1
    reported by get_git_sha1 is compared with the one stored in cache; a
    difference is reported on stderr. Second, every repository head is
    checked to be the only head of its branch.

    Returns False at the first problem unless force is true, in which case
    all problems are reported and True is returned.
    """
    tips = dict((bn, branchtip(repo, heads))
                for bn, heads in repo.branchmap().iteritems())
    by_rev = sorted((-repo.changelog.rev(n), n, t) for t, n in tips.items())

    # get list of hg's branches to verify, don't take all git has
    for _, _, hg_branch in by_rev:
        git_branch = get_branch(hg_branch)
        sha1 = get_git_sha1(git_branch)
        cached = cache.get(git_branch)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (git_branch,sha1,cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for head in repo.heads():
        branch = get_changeset(ui,repo,head)[6]
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(head))
            if not force:
                return False
        seen.add(branch)

    return True
342 | |
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,
           authors={},branchesmap={},tagsmap={},
           sob=False,force=False,hgtags=False,notes=False,encoding='',fn_encoding=''):
    """Export the hg repository at repourl as a command stream on stdout.

    m limits the export to revisions below it (negative means up to tip).
    State from the previous run is loaded from marksfile, mappingfile,
    headsfile and tipfile; tipfile and mappingfile are rewritten afterwards.
    The remaining arguments are forwarded to export_commit / export_tags.

    Returns 1 if verify_heads fails, otherwise 0.
    """
    limit = int(m)

    # stored marks are 1-based; convert them back to hg revnums
    old_marks = load_cache(marksfile, lambda s: int(s)-1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui,repo,heads_cache,force):
        return 1

    # changelog.count() is not always available; fall back to len(repo)
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    first_rev = int(state_cache.get('tip',0))
    if limit < 0 or limit > tip:
        last_rev = tip
    else:
        last_rev = limit

    # record hex node -> revnum for every revision up to the limit
    for rev in range(0, last_rev):
        revnode = get_changeset(ui,repo,rev,authors)[0]
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    commands = 0
    brmap = {}
    for rev in range(first_rev, last_rev):
        commands = export_commit(ui,repo,rev,old_marks,last_rev,commands,authors,branchesmap,
                                 sob,brmap,hgtags,notes,encoding,fn_encoding)

    state_cache['tip'] = last_rev
    state_cache['repo'] = repourl
    save_cache(tipfile,state_cache)
    save_cache(mappingfile,mapping_cache)

    commands = export_tags(ui,repo,old_marks,mapping_cache,commands,authors,tagsmap)

    sys.stderr.write('Issued %d commands\n' % commands)

    return 0
389 | |
if __name__=='__main__':
    def bail(parser,opt):
        """Report a missing mandatory option, print usage and exit with status 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    def optional_mapping(label, path):
        """Load a mapping file if a path was given, else return an empty dict."""
        if path is None:
            return {}
        return load_mapping(label, path)

    parser=OptionParser()

    parser.add_option("-m","--max",type="int",dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping",dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks",dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads",dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status",dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r","--repo",dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s",action="store_true",dest="sob",
                      default=False,help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags",action="store_true",dest="hgtags",
                      default=False,help="Enable exporting .hgtags files")
    parser.add_option("-A","--authors",dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-B","--branches",dest="branchesfile",
                      help="Read branch map from BRANCHESFILE")
    parser.add_option("-T","--tags",dest="tagsfile",
                      help="Read tags map from TAGSFILE")
    parser.add_option("-f","--force",action="store_true",dest="force",
                      default=False,help="Ignore validation errors by force")
    parser.add_option("-M","--default-branch",dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o","--origin",dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash",action="store_true",dest="notes",
                      default=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e",dest="encoding",
                      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
    parser.add_option("--fe",dest="fn_encoding",
                      help="Assume file names from Mercurial are encoded in <filename_encoding>")

    (options,args)=parser.parse_args()

    # -1 means "no limit"
    m = options.max if options.max is not None else -1

    # mandatory options, checked in this order
    for dest, flag in (('marksfile', '--marks'),
                       ('mappingfile', '--mapping'),
                       ('headsfile', '--heads'),
                       ('statusfile', '--status'),
                       ('repourl', '--repo')):
        if getattr(options, dest) is None:
            bail(parser, flag)

    a = optional_mapping('authors', options.authorfile)
    b = optional_mapping('branches', options.branchesfile)
    t = optional_mapping('tags', options.tagsfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = options.encoding if options.encoding is not None else ''

    # file names use the commit encoding unless --fe overrides it
    fn_encoding = options.fn_encoding if options.fn_encoding is not None else encoding

    sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a,branchesmap=b,tagsmap=t,
                    sob=options.sob,force=options.force,hgtags=options.hgtags,
                    notes=options.notes,encoding=encoding,fn_encoding=fn_encoding))