To check out this repository, please run `hg clone` on the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / extra / fast-export / hg-fast-export.py @ 1547:bca3b5e5bbf2

History | View | Annotate | Download (15.2 KB)

1
#!/usr/bin/env python
2

    
3
# Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4
# License: MIT <http://www.opensource.org/licenses/mit-license.php>
5

    
6
from mercurial import node
7
from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8
from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9
from optparse import OptionParser
10
import re
11
import sys
12
import os
13

    
14
if sys.platform == "win32":
  # Windows opens sys.stdout in text mode, so each LF (\n) written to it
  # would be turned into CRLF (\r\n). That makes git blow up, hence this
  # platform-specific switch of stdout to binary mode.
  import msvcrt
  msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
21

    
22
# Pattern spotting "Signed-off-by: <someone>" lines in a log message;
# group 1 captures everything after the colon and the space.
sob_re=re.compile(r'^Signed-[Oo]ff-[Bb]y: (.+)$')
# Emit a 'checkpoint' command after this many commits; 0 turns it off.
cfg_checkpoint_count=0
# Report progress on stderr each time this many file contents were written.
cfg_export_boundary=1000
28

    
29
def gitmode(flags):
  """Map a file's flags to a git file mode string.

  'l' in flags yields '120000' and takes precedence over 'x', which
  yields '100755'; without either flag the result is '100644'.
  """
  if 'l' in flags:
    return '120000'
  if 'x' in flags:
    return '100755'
  return '100644'
31

    
32
def wr_no_nl(msg=''):
  """Write msg to stdout as-is, without appending a newline.

  An empty (falsy) msg results in no write at all.
  """
  if not msg:
    return
  sys.stdout.write(msg)
35

    
36
def wr(msg=''):
  """Write msg to stdout and terminate it with a newline.

  With the default empty msg only the newline is emitted.
  """
  wr_no_nl(msg)
  wr_no_nl('\n')
40

    
41
def checkpoint(count):
  """Count one more issued command and return the new total.

  When cfg_checkpoint_count is positive and the new total is a multiple
  of it, a note is written to stderr and a 'checkpoint' command followed
  by an empty line is written to stdout.
  """
  total=count+1
  due=cfg_checkpoint_count>0 and total%cfg_checkpoint_count==0
  if due:
    sys.stderr.write("Checkpoint after %d commits\n" % total)
    wr('checkpoint')
    wr()
  return total
48

    
49
def revnum_to_revref(rev, old_marks):
  """Turn an hg revision number into a git-fast-import revision reference.

  A truthy entry for rev in old_marks is returned as-is; otherwise the
  result is the mark ':<rev+1>'.
  """
  known=old_marks.get(rev)
  if known:
    return known
  return ':%d' % (rev+1)
53

    
54
def file_mismatch(f1,f2):
  """Return True when the hex forms of two file revisions differ."""
  hex1=node.hex(f1)
  hex2=node.hex(f2)
  return hex1!=hex2
57

    
58
def split_dict(dleft,dright,l=None,c=None,r=None,match=None):
  """Sort the files of two manifests into added, changed and removed.

  dleft  -- manifest of the revision being looked at
  dright -- manifest it is compared against (a parent)
  l,c,r  -- lists that receive the names found only in dleft, found in
            both but differing, and found only in dright. Lists passed
            in are extended in place; omitted ones start out empty on
            every call (they used to be shared mutable defaults).
  match  -- callable(left_entry, right_entry) returning True when the two
            entries differ; file_mismatch is used when omitted

  Both manifests must provide keys(), get(), item access and flags(name).
  A file also counts as changed when its git mode differs between the
  two manifests. Returns the tuple (l, c, r).
  """
  if l is None: l=[]
  if c is None: c=[]
  if r is None: r=[]
  # resolved at call time so the default does not have to exist when this
  # function is defined
  if match is None: match=file_mismatch

  for name in dleft.keys():
    other=dright.get(name,None)
    if other is None:
      # we have the file but our parent hasn't: add to left set
      l.append(name)
    elif match(dleft[name],other) or gitmode(dleft.flags(name))!=gitmode(dright.flags(name)):
      # both have it but content or mode differ: add to center set
      c.append(name)
  for name in dright.keys():
    if dleft.get(name,None) is None:
      # parent has the file but we don't: add to right set
      r.append(name)
    # changes were already handled when comparing child against parent
  return l,c,r
75

    
76
def get_filechanges(repo,revision,parents,mleft):
  """Find added, changed and removed files of a revision versus its parents.

  Every parent with a non-negative revision number has its manifest
  compared against mleft through split_dict; the findings of all parents
  accumulate in the same three lists. Returns them sorted as
  (added, changed, removed). The revision argument is not used here.
  """
  added,changed,removed=[],[],[]
  for parent in parents:
    if parent>=0:
      parent_manifest=repo.changectx(parent).manifest()
      added,changed,removed=split_dict(mleft,parent_manifest,added,changed,removed)
  for names in (added,changed,removed):
    names.sort()
  return added,changed,removed
87

    
88
def get_author(logmessage,committer,authors):
  """Pick the author of a commit from trailing Signed-off-by lines.

  Git distinguishes author and committer, so the author is taken from
  the log message when possible: empty lines at the end of the message
  are skipped, then the unbroken run of Signed-off-by lines ending at
  the last non-empty line is walked upwards. The topmost line of that
  run names the author, which is passed through fixup_user together
  with the authors table.

  Signed-off-by lines elsewhere in the message are deliberately ignored,
  as text in the middle of a message may match the pattern by accident.
  Without such a trailing run the committer is returned.
  """
  lines=logmessage.split('\n')

  # skip the empty tail of the message
  idx=len(lines)-1
  while idx>=0 and not lines[idx].strip():
    idx-=1

  # climb through the block of sob lines, remembering the topmost one
  topmost=None
  while idx>=0:
    found=sob_re.match(lines[idx])
    if found is None:
      break
    topmost=found
    idx-=1

  if topmost is None:
    return committer
  return fixup_user(topmost.group(1),authors)
124

    
125
def export_file_contents(ctx,manifest,files,hgtags,encoding=''):
  """Write an inline 'M' command plus data block for every file in files.

  ctx      -- changeset context; file data comes from ctx.filectx(f).data()
  manifest -- manifest whose flags(f) decide the git file mode
  files    -- names of the files to write
  hgtags   -- when false, a file named '.hgtags' is skipped (with a note
              on stderr)
  encoding -- when set, file names are re-encoded from it to UTF-8

  Progress goes to stderr every cfg_export_boundary written files, and
  once more at the end if more than cfg_export_boundary files were given.
  """
  total=len(files)
  written=0
  for name in files:
    # Skip .hgtags files. They only get us in trouble.
    if name == ".hgtags" and not hgtags:
      sys.stderr.write('Skip %s\n' % (name))
      continue
    data=ctx.filectx(name).data()
    out_name=name
    if encoding:
      out_name=name.decode(encoding).encode('utf8')
    mode=gitmode(manifest.flags(name))
    wr('M %s inline %s' % (mode,strip_leading_slash(out_name)))
    # length of the data actually read
    wr('data %d' % len(data))
    wr(data)
    written+=1
    if written%cfg_export_boundary==0:
      sys.stderr.write('Exported %d/%d files\n' % (written,total))
  if total>cfg_export_boundary:
    sys.stderr.write('Exported %d/%d files\n' % (written,total))
147

    
148
def sanitize_name(name,what="branch"):
  """Sanitize input roughly according to git-check-ref-format(1).

  name -- the branch or tag name to clean up
  what -- word used for the kind of name in the warning message

  The characters '[', ' ', '~', '^', ':', '?', '\\' and '*' as well as
  the sequence '..' become '_'; a trailing '/' or '.' becomes '_'; a '.'
  at the start of any '/'-separated component becomes '_'; runs of '_'
  collapse into one. A warning goes to stderr when the result differs
  from the input. Returns the sanitized name.

  Empty names and empty path components (leading '/', '//') are passed
  through unchanged instead of raising IndexError.
  """

  def dot(component):
    # slice instead of [0]: components may be empty ('/x', 'a//b')
    if component[:1] == '.': return '_'+component[1:]
    return component

  n=re.sub(r'([\[ ~^:?\\*]|\.\.)','_',name)
  # slice instead of [-1]: n may be empty
  if n[-1:] in ('/', '.'): n=n[:-1]+'_'
  n='/'.join(map(dot,n.split('/')))
  n=re.sub('_+','_',n)

  if n!=name:
    sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
  return n
166

    
167
def strip_leading_slash(filename):
  """Return filename without its first character if that one is '/'.

  Only a single leading slash is dropped. An empty filename is returned
  unchanged rather than raising IndexError.
  """
  # slice instead of [0] so that '' is handled
  if filename[:1] == '/':
    return filename[1:]
  return filename
171

    
172
def export_commit(ui,repo,revision,old_marks,max,count,authors,
                  branchesmap,sob,brmap,hgtags,notes,encoding='',fn_encoding=''):
  """Write one hg revision to stdout as a git-fast-import commit.

  ui, repo    -- passed to get_changeset; repo also supplies parents,
                 status, manifest and file contents
  revision    -- hg revision number; the commit is marked ':<revision+1>'
  old_marks   -- mapping consulted by revnum_to_revref for parent refs
  max         -- total shown in the progress message
  count       -- running command count, advanced via checkpoint()
  authors     -- author table handed to get_changeset and get_author
  branchesmap -- mapping of hg branch names to replacement names
  sob         -- when true, an 'author' line derived from Signed-off-by
                 lines is written
  brmap       -- cache of already sanitized branch names; updated here
  hgtags      -- passed on to export_file_contents
  notes       -- passed on to generate_note
  encoding    -- passed to get_changeset
  fn_encoding -- source encoding of file names, re-encoded to UTF-8

  Returns the updated command count.
  """
  def get_branchname(name):
    # sanitize each hg branch name only once, then serve it from brmap
    if name not in brmap:
      brmap[name]=sanitize_name(branchesmap.get(name,name))
    return brmap[name]

  (revnode,_,user,(when,tz),_files,desc,hg_branch,_)=get_changeset(ui,repo,revision,authors,encoding)

  branch=get_branchname(hg_branch)

  parents=[p for p in repo.changelog.parentrevs(revision) if p>=0]
  parentless=len(parents)==0

  # a parentless revision other than r0 starts its branch from scratch
  if parentless and revision!=0:
    wr('reset refs/heads/%s' % branch)

  wr('commit refs/heads/%s' % branch)
  wr('mark :%d' % (revision+1))
  if sob:
    wr('author %s %d %s' % (get_author(desc,user,authors),when,tz))
  wr('committer %s %d %s' % (user,when,tz))
  # one more than len(desc): wr() puts a newline right behind desc
  wr('data %d' % (len(desc)+1))
  wr(desc)
  wr()

  ctx=repo.changectx(str(revision))
  man=ctx.manifest()
  added,changed,removed,kind=[],[],[],''

  if parentless:
    # first revision: feed in full manifest
    added=sorted(man.keys())
    kind='full'
  else:
    wr('from %s' % revnum_to_revref(parents[0], old_marks))
    if len(parents)>1:
      # a merge with two parents: for many files comparing checksums is
      # expensive, so it is only done here where hg's revlog logic
      # requires it
      wr('merge %s' % revnum_to_revref(parents[1], old_marks))
      added,changed,removed=get_filechanges(repo,revision,parents,man)
      kind='thorough delta'
    else:
      # exactly one parent: take the changes straight from repo.status
      # without expensively comparing checksums
      status=repo.status(repo.lookup(parents[0]),revnode)[:3]
      added,changed,removed=status[1],status[0],status[2]
      kind='simple delta'

  sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
      (branch,kind,revision+1,max,len(added),len(changed),len(removed)))

  if fn_encoding:
    removed=[path.decode(fn_encoding).encode('utf8') for path in removed]

  removed=[strip_leading_slash(path) for path in removed]

  for path in removed:
    wr('D %s' % path)
  export_file_contents(ctx,man,added,hgtags,fn_encoding)
  export_file_contents(ctx,man,changed,hgtags,fn_encoding)
  wr()

  count=checkpoint(count)
  return generate_note(user,when,tz,revision,ctx,count,notes)
241

    
242
def generate_note(user,time,timezone,revision,ctx,count,notes):
  """Write a commit on refs/notes/hg carrying the hg hash of a revision.

  Does nothing and returns count unchanged when notes is false.
  Otherwise a notes commit is written whose inline note for mark
  ':<revision+1>' holds ctx.hex(), and the count advanced through
  checkpoint() is returned.
  """
  if not notes:
    return count
  wr('commit refs/notes/hg')
  wr('committer %s %d %s' % (user,time,timezone))
  # the notes commit itself has an empty message
  wr('data 0')
  wr('N inline :%d' % (revision+1))
  sha=ctx.hex()
  wr('data %d' % len(sha))
  # the hash is written without newline of its own; wr() adds the one
  # following the data block
  wr_no_nl(sha)
  wr()
  return checkpoint(count)
254
  
255
def export_tags(ui,repo,old_marks,mapping_cache,count,authors,tagsmap):
  """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

  repo          -- its tagslist() yields (tag, node) pairs
  old_marks     -- mapping consulted by revnum_to_revref
  mapping_cache -- maps hex node ids to hg revision numbers
  count         -- running command count, advanced via checkpoint()
  tagsmap       -- mapping of tag names to replacement names

  ui and authors are accepted but not used here. Returns the updated
  command count.
  """
  for hg_tag,tagnode in repo.tagslist():
    # remap and sanitize the tag name
    tag=sanitize_name(tagsmap.get(hg_tag,hg_tag),"tag")
    # ignore latest revision
    if tag=='tip': continue
    hexnode=tagnode.encode('hex_codec')
    # ignore tags to nodes that are missing (ie, 'in the future')
    if hexnode not in mapping_cache:
      sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
      continue

    rev=int(mapping_cache[hexnode])

    ref=revnum_to_revref(rev, old_marks)
    if ref is None:
      sys.stderr.write('Failed to find reference for creating tag'
          ' %s at r%d\n' % (tag,rev))
      continue
    sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
    wr('reset refs/tags/%s' % tag)
    wr('from %s' % ref)
    wr()
    count=checkpoint(count)
  return count
280

    
281
def load_mapping(name, filename):
  """Read a 'key = value' mapping file into a dict.

  name     -- what the entries are called in the summary on stderr
  filename -- file to read; a missing file yields an empty dict

  Empty lines and lines starting with '#' are skipped. Each remaining
  line is split at its first '='; key and value are stripped of
  surrounding whitespace. Lines that do not fit are reported on stderr
  with their line number and skipped. Returns the dict.
  """
  cache={}
  if not os.path.exists(filename):
    return cache
  lre=re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
  loaded=0
  f=open(filename,'r')
  # try/finally so the file is closed even if reading fails midway
  try:
    for idx,line in enumerate(f):
      line=line.strip()
      if line=='' or line[0]=='#':
        continue
      m=lre.match(line)
      if m is None:
        sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,idx+1))
        continue
      cache[m.group(1).strip()]=m.group(2).strip()
      loaded+=1
  finally:
    f.close()
  sys.stderr.write('Loaded %d %s\n' % (loaded, name))
  return cache
304

    
305
def branchtip(repo, heads):
  '''Return the tipmost head in heads that is not marked closed.

  Heads are tried from last to first; the first one whose changelog
  entry has no 'close' in field 5 (presumably the extra data - confirm)
  wins. If all of them are closed, the last head is returned.
  '''
  for head in reversed(heads):
    extra = repo.changelog.read(head)[5]
    if 'close' not in extra:
      return head
  return heads[-1]
313

    
314
def verify_heads(ui,repo,cache,force):
  """Check that the git branches and the hg heads are fit for exporting.

  First, for every hg branch (tipmost branch first) the SHA1 obtained
  through get_git_sha1 is compared with the one remembered in cache; a
  difference is reported as a modification outside hg-fast-export.
  Second, every head of the repository must belong to a branch no other
  head belongs to.

  Each problem is reported on stderr. Returns False at the first problem
  unless force is true; returns True otherwise.
  """
  tips=dict((bn,branchtip(repo,heads)) for bn,heads in repo.branchmap().iteritems())
  by_rev=sorted([(-repo.changelog.rev(n), n, t) for t, n in tips.items()])

  # get list of hg's branches to verify, don't take all git has
  for _,_,hg_branch in by_rev:
    git_branch=get_branch(hg_branch)
    sha1=get_git_sha1(git_branch)
    cached=cache.get(git_branch)
    if sha1==cached:
      continue
    sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
      '\n%s (repo) != %s (cache)\n' % (git_branch,sha1,cached))
    if not force: return False

  # verify that branch has exactly one head
  seen=set()
  for head in repo.heads():
    (_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,head)
    if branch in seen:
      sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
          repo.changelog.rev(head))
      if not force: return False
    seen.add(branch)

  return True
342

    
343
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,
           authors={},branchesmap={},tagsmap={},
           sob=False,force=False,hgtags=False,notes=False,encoding='',fn_encoding=''):
  """Export an hg repository to stdout as a git-fast-import stream.

  repourl     -- repository to open through setup_repo
  m           -- highest revision count to export; negative means all
  marksfile, mappingfile, headsfile, tipfile
              -- cache files read with load_cache; the mapping and tip
                 caches are written back with save_cache
  authors, branchesmap, tagsmap
              -- name mappings handed on to the export functions
  sob, force, hgtags, notes, encoding, fn_encoding
              -- handed on to verify_heads / export_commit

  Returns 1 when verify_heads fails and 0 after a completed export.
  """
  limit=int(m)

  old_marks=load_cache(marksfile,lambda s: int(s)-1)
  mapping_cache=load_cache(mappingfile)
  heads_cache=load_cache(headsfile)
  state_cache=load_cache(tipfile)

  ui,repo=setup_repo(repourl)

  if not verify_heads(ui,repo,heads_cache,force):
    return 1

  # fall back to len(repo) where the changelog has no count()
  try:
    tip=repo.changelog.count()
  except AttributeError:
    tip=len(repo)

  # resume where the previous run stopped; never go past the tip
  start=int(state_cache.get('tip',0))
  stop=limit
  if limit<0 or limit>tip:
    stop=tip

  # remember the revision number of every node up to the stop point
  for rev in range(0,stop):
    (revnode,_,_,_,_,_,_,_)=get_changeset(ui,repo,rev,authors)
    mapping_cache[revnode.encode('hex_codec')] = str(rev)

  commands=0
  brmap={}
  for rev in range(start,stop):
    commands=export_commit(ui,repo,rev,old_marks,stop,commands,authors,branchesmap,
                           sob,brmap,hgtags,notes,encoding,fn_encoding)

  state_cache['tip']=stop
  state_cache['repo']=repourl
  save_cache(tipfile,state_cache)
  save_cache(mappingfile,mapping_cache)

  commands=export_tags(ui,repo,old_marks,mapping_cache,commands,authors,tagsmap)

  sys.stderr.write('Issued %d commands\n' % commands)

  return 0
389

    
390
if __name__=='__main__':
  def bail(parser,opt):
    # report the missing option, show usage and exit with status 2
    sys.stderr.write('Error: No %s option given\n' % opt)
    parser.print_help()
    sys.exit(2)

  parser=OptionParser()

  parser.add_option("-m","--max",type="int",dest="max",
      help="Maximum hg revision to import")
  parser.add_option("--mapping",dest="mappingfile",
      help="File to read last run's hg-to-git SHA1 mapping")
  parser.add_option("--marks",dest="marksfile",
      help="File to read git-fast-import's marks from")
  parser.add_option("--heads",dest="headsfile",
      help="File to read last run's git heads from")
  parser.add_option("--status",dest="statusfile",
      help="File to read status from")
  parser.add_option("-r","--repo",dest="repourl",
      help="URL of repo to import")
  parser.add_option("-s",action="store_true",dest="sob",
      default=False,help="Enable parsing Signed-off-by lines")
  parser.add_option("--hgtags",action="store_true",dest="hgtags",
      default=False,help="Enable exporting .hgtags files")
  parser.add_option("-A","--authors",dest="authorfile",
      help="Read authormap from AUTHORFILE")
  parser.add_option("-B","--branches",dest="branchesfile",
      help="Read branch map from BRANCHESFILE")
  parser.add_option("-T","--tags",dest="tagsfile",
      help="Read tags map from TAGSFILE")
  parser.add_option("-f","--force",action="store_true",dest="force",
      default=False,help="Ignore validation errors by force")
  parser.add_option("-M","--default-branch",dest="default_branch",
      help="Set the default branch")
  parser.add_option("-o","--origin",dest="origin_name",
      help="use <name> as namespace to track upstream")
  parser.add_option("--hg-hash",action="store_true",dest="notes",
      default=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
  parser.add_option("-e",dest="encoding",
      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
  parser.add_option("--fe",dest="fn_encoding",
      help="Assume file names from Mercurial are encoded in <filename_encoding>")

  (options,args)=parser.parse_args()

  # -1 means no limit on the revisions to import
  m=options.max
  if m is None:
    m=-1

  # these options are mandatory; complain about the first one missing
  for value,flag in ((options.marksfile,'--marks'),
                     (options.mappingfile,'--mapping'),
                     (options.headsfile,'--heads'),
                     (options.statusfile,'--status'),
                     (options.repourl,'--repo')):
    if value is None:
      bail(parser,flag)

  # optional mapping files; each defaults to an empty mapping
  a={}
  if options.authorfile is not None:
    a=load_mapping('authors', options.authorfile)

  b={}
  if options.branchesfile is not None:
    b=load_mapping('branches', options.branchesfile)

  t={}
  if options.tagsfile is not None:
    t=load_mapping('tags', options.tagsfile)

  if options.default_branch is not None:
    set_default_branch(options.default_branch)

  if options.origin_name is not None:
    set_origin_name(options.origin_name)

  encoding=options.encoding
  if encoding is None:
    encoding=''

  # file names use the general encoding unless --fe says otherwise
  fn_encoding=options.fn_encoding
  if fn_encoding is None:
    fn_encoding=encoding

  sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile,
                  options.headsfile, options.statusfile,
                  authors=a,branchesmap=b,tagsmap=t,
                  sob=options.sob,force=options.force,hgtags=options.hgtags,
                  notes=options.notes,encoding=encoding,fn_encoding=fn_encoding))