To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / extra / fast-export / hg-fast-export.py @ 1590:c18460da6620

History | View | Annotate | Download (16.5 KB)

1
#!/usr/bin/env python
2

    
3
# Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4
# License: MIT <http://www.opensource.org/licenses/mit-license.php>
5

    
6
from mercurial import node
7
from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8
from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9
from optparse import OptionParser
10
import re
11
import sys
12
import os
13

    
14
if sys.platform == "win32":
  # Windows opens sys.stdout in text mode, so every LF (\n) written to it
  # would be turned into CRLF (\r\n). That makes git blow up, so switch the
  # stream to binary mode with this platform-specific call.
  import msvcrt
  msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

# write a progress message every this many file contents written
cfg_export_boundary=1000
# insert a 'checkpoint' command after this many commits; 0 means never
cfg_checkpoint_count=0
# simple regex matching a whole Signed-off-by line; group 1 is the signer
sob_re=re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
28

    
29
def gitmode(flags):
  """Map hg file flags to the git file mode string used in 'M' commands.

  The 'l' flag takes precedence over 'x'; without either flag the mode of
  a regular, non-executable file is returned.
  """
  if 'l' in flags:
    return '120000'
  if 'x' in flags:
    return '100755'
  return '100644'
31

    
32
def wr_no_nl(msg=''):
  """Write msg to stdout as is, without appending a newline.

  An empty (falsy) msg results in no write at all.
  """
  if not msg:
    return
  sys.stdout.write(msg)
35

    
36
def wr(msg=''):
  """Write msg to stdout followed by exactly one newline.

  With an empty msg only the newline is written.
  """
  out=sys.stdout
  if msg:
    out.write(msg)
  out.write('\n')
39
  #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
40

    
41
def checkpoint(count):
  """Account for one more issued command and checkpoint if one is due.

  A 'checkpoint' command (plus an empty line) is written whenever the
  incremented count is a multiple of cfg_checkpoint_count; a value of 0 or
  less for cfg_checkpoint_count disables checkpoints. Returns the
  incremented count.
  """
  count+=1
  if cfg_checkpoint_count<=0:
    return count
  if count%cfg_checkpoint_count!=0:
    return count
  sys.stderr.write("Checkpoint after %d commits\n" % count)
  wr('checkpoint')
  wr()
  return count
48

    
49
def revnum_to_revref(rev, old_marks):
  """Return the git-fast-import reference for hg revision number rev.

  If old_marks holds a non-empty entry for rev that entry is used,
  otherwise the mark ':<rev+1>' is returned.
  """
  known=old_marks.get(rev)
  if known:
    return known
  return ':%d' % (rev+1)
53

    
54
def file_mismatch(f1,f2):
  """Tell whether two file revision ids differ, comparing their hex forms."""
  hex1=node.hex(f1)
  hex2=node.hex(f2)
  return hex1!=hex2
57

    
58
def split_dict(dleft,dright,l=None,c=None,r=None,match=None):
  """Sort the files of two manifests into added, changed and removed sets.

  dleft is the manifest of the revision being looked at, dright the
  manifest of one of its parents. Files only present in dleft are appended
  to l, files present in both but differing (according to match, or in
  their git file mode) are appended to c, and files only present in dright
  are appended to r.

  Lists passed in for l, c and r are extended in place, which lets a caller
  accumulate results over several parents. Omitted lists start out empty on
  every call; they are not shared between calls. match defaults to
  file_mismatch and is looked up at call time.

  Returns the tuple (l, c, r).
  """
  if l is None: l=[]
  if c is None: c=[]
  if r is None: r=[]
  if match is None: match=file_mismatch
  for left in dleft.keys():
    right=dright.get(left,None)
    if right is None:
      # we have the file but our parent hasn't: add to left set
      l.append(left)
    elif match(dleft[left],right) or gitmode(dleft.flags(left))!=gitmode(dright.flags(left)):
      # we have it but checksums or modes mismatch: add to center set
      c.append(left)
  for right in dright.keys():
    # a change is already handled when comparing child against parent, so
    # only look for files the parent has but we don't: add to right set
    if dleft.get(right,None) is None:
      r.append(right)
  return l,c,r
75

    
76
def get_filechanges(repo,revision,parents,mleft):
  """Collect added, changed and removed files of mleft against all parents.

  mleft is compared with the manifest of every parent whose revision
  number is not negative; the results of all comparisons are accumulated.
  Returns three sorted lists: (added, changed, removed).
  """
  added,changed,removed=[],[],[]
  for parent in parents:
    if parent>=0:
      parent_manifest=repo.changectx(parent).manifest()
      added,changed,removed=split_dict(mleft,parent_manifest,added,changed,removed)
  for names in (added,changed,removed):
    names.sort()
  return added,changed,removed
87

    
88
def get_author(logmessage,committer,authors):
  """Derive the git author of a commit from trailing Signed-off-by lines.

  Git distinguishes between the author and the committer of a patch, so
  this tries to find the author in the log message: blank lines at the end
  of the message are skipped, then the consecutive run of Signed-off-by
  lines ending at the last non-blank line is walked upwards. The topmost
  line of that run names the author, which is passed through fixup_user()
  together with the authors table.

  Only that trailing run is considered on purpose: a line in the middle of
  a message may accidentally start with "Signed-off-by: foo" and must not
  be mistaken for authorship information.

  If the last non-blank line is no Signed-off-by line (or the message is
  blank), committer is returned unchanged.
  """
  lines=logmessage.split('\n')
  # find the last non-blank line; ends up at -1 if there is none
  pos=len(lines)-1
  while pos>=0 and not lines[pos].strip():
    pos-=1
  # climb the run of Signed-off-by lines and remember the topmost match
  topmost=None
  while pos>=0:
    hit=sob_re.match(lines[pos])
    if hit is None:
      break
    topmost=hit
    pos-=1
  if topmost is None:
    return committer
  return fixup_user(topmost.group(1),authors)
124

    
125
def export_file_contents(ctx,manifest,files,hgtags,encoding=''):
  """Write the contents of the given files as inline 'M' commands.

  For every name in files the data is read from ctx and written as a
  'M <mode> inline <path>' command followed by a 'data' block. The mode is
  derived from the flags stored in manifest. If encoding is set, the file
  name is decoded with it and re-encoded as UTF-8 first. '.hgtags' is
  skipped unless hgtags is true. Progress is reported on stderr every
  cfg_export_boundary files.
  """
  total=len(files)
  written=0
  for name in files:
    # .hgtags files only get us in trouble, skip them unless asked for
    if name == ".hgtags" and not hgtags:
      sys.stderr.write('Skip %s\n' % (name))
      continue
    data=ctx.filectx(name).data()
    git_name=name
    if encoding:
      git_name=name.decode(encoding).encode('utf8')
    mode=gitmode(manifest.flags(name))
    wr('M %s inline %s' % (mode,strip_leading_slash(git_name)))
    # the length is taken from the data itself rather than from size()
    wr('data %d' % len(data))
    wr(data)
    written+=1
    if written%cfg_export_boundary==0:
      sys.stderr.write('Exported %d/%d files\n' % (written,total))
  if total>cfg_export_boundary:
    sys.stderr.write('Exported %d/%d files\n' % (written,total))
147

    
148
def sanitize_name(name,what="branch", mapping=None):
  """Sanitize input roughly according to git-check-ref-format(1)

  name is first looked up in mapping (if given); the result then has the
  characters '[', ' ', '~', '^', ':', '?', '\\', '*' and the sequence '..'
  replaced by '_', a trailing '/' or '.' replaced by '_', a leading '.' of
  every '/'-separated component replaced by '_' and runs of '_' collapsed
  into one. An empty name or empty path component is passed through
  unchanged instead of raising IndexError.

  what is only used in the warning written to stderr when the result
  differs from name. Returns the sanitized name.
  """

  # NOTE: Do not update this transform to work around
  # incompatibilities on your platform. If you change it and it starts
  # modifying names which previously were not touched it will break
  # preexisting setups which are doing incremental imports.
  #
  # Use the -B and -T options to mangle branch and tag names
  # instead. If you have a source repository where this is too much
  # work to do manually, write a tool that does it for you.

  def dot(component):
    # startswith() rather than component[0] so empty components are safe
    if component.startswith('.'): return '_'+component[1:]
    return component

  if mapping is None:
    mapping={}
  n=mapping.get(name,name)
  # same character set as before, written as a raw string with '[' escaped
  p=re.compile(r'([\[ ~^:?\\*]|\.\.)')
  n=p.sub('_', n)
  # endswith() rather than n[-1] so an empty name is safe
  if n.endswith(('/', '.')): n=n[:-1]+'_'
  n='/'.join([dot(part) for part in n.split('/')])
  p=re.compile('_+')
  n=p.sub('_', n)

  if n!=name:
    sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
  return n
175

    
176
def strip_leading_slash(filename):
  """Return filename without its first character if that is a '/'.

  Only a single leading slash is removed. An empty filename is returned
  unchanged instead of raising IndexError.
  """
  if filename.startswith('/'):
    return filename[1:]
  return filename
180

    
181
def export_commit(ui,repo,revision,old_marks,max,count,authors,
                  branchesmap,sob,brmap,hgtags,encoding='',fn_encoding=''):
  """Write one hg revision to stdout as a git-fast-import commit.

  The output consists of an optional 'reset' (for revisions without
  parents other than revision 0), the 'commit' header with mark
  revision+1, an 'author' line (only if sob is set, see get_author()), the
  'committer' line, the log message, 'from'/'merge' references to the
  parents, one 'D' line per removed file and the inline contents of all
  added and changed files.

  brmap caches sanitized branch names between calls and is updated in
  place. max is only used in the progress message written to stderr. If
  fn_encoding is set, file names are decoded with it and re-encoded as
  UTF-8. Returns the command counter as updated by checkpoint().
  """
  def get_branchname(name):
    # sanitize every hg branch name only once and remember the result
    if name in brmap:
      return brmap[name]
    n=sanitize_name(name, "branch", branchesmap)
    brmap[name]=n
    return n

  (revnode,_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding)

  branch=get_branchname(branch)

  parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

  if not parents and revision != 0:
    wr('reset refs/heads/%s' % branch)

  wr('commit refs/heads/%s' % branch)
  wr('mark :%d' % (revision+1))
  if sob:
    wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
  wr('committer %s %d %s' % (user,time,timezone))
  # wr() appends a newline to desc, hence the announced length len(desc)+1
  wr('data %d' % (len(desc)+1))
  wr(desc)
  wr()

  ctx=repo.changectx(str(revision))
  man=ctx.manifest()
  added,changed,removed,delta_kind=[],[],[],''

  if not parents:
    # first revision: feed in full manifest
    added=sorted(man.keys())
    delta_kind='full'
  else:
    wr('from %s' % revnum_to_revref(parents[0], old_marks))
    if len(parents) == 1:
      # later non-merge revision: feed in changed manifest
      # if we have exactly one parent, just take the changes from the
      # manifest without expensively comparing checksums
      f=repo.status(repo.lookup(parents[0]),revnode)[:3]
      added,changed,removed=f[1],f[0],f[2]
      delta_kind='simple delta'
    else: # a merge with two parents
      wr('merge %s' % revnum_to_revref(parents[1], old_marks))
      # later merge revision: feed in changed manifest
      # for many files comparing checksums is expensive so only do it for
      # merges where we really need it due to hg's revlog logic
      added,changed,removed=get_filechanges(repo,revision,parents,man)
      delta_kind='thorough delta'

  sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
      (branch,delta_kind,revision+1,max,len(added),len(changed),len(removed)))

  if fn_encoding:
    removed=[r.decode(fn_encoding).encode('utf8') for r in removed]

  removed=[strip_leading_slash(x) for x in removed]

  # an explicit loop: map() must not be relied upon for side effects
  for r in removed:
    wr('D %s' % r)
  export_file_contents(ctx,man,added,hgtags,fn_encoding)
  export_file_contents(ctx,man,changed,hgtags,fn_encoding)
  wr()

  return checkpoint(count)
248

    
249
def export_note(ui,repo,revision,count,authors,encoding,is_first):
  """Write a git note carrying the hg changeset hash of revision.

  A commit on refs/notes/hg with an empty message is written whose only
  change is an inline note on mark revision+1; the note's content is the
  hex hash of the hg changeset. If is_first is true, the commit is based
  on the current refs/notes/hg via a 'from' line. Returns the command
  counter as updated by checkpoint().
  """
  (_,_,user,(time,timezone),_,_,_,_)=get_changeset(ui,repo,revision,authors,encoding)

  wr('commit refs/notes/hg')
  wr('committer %s %d %s' % (user,time,timezone))
  wr('data 0')
  if is_first:
    wr('from refs/notes/hg^0')
  wr('N inline :%d' % (revision+1))
  hg_hash=repo.changectx(str(revision)).hex()
  wr('data %d' % (len(hg_hash)))
  # the hash is written without newline so it matches the announced length
  wr_no_nl(hg_hash)
  wr()
  return checkpoint(count)
269

    
270
def export_tags(ui,repo,old_marks,mapping_cache,count,authors,tagsmap):
  """Write a 'reset refs/tags/<name>' command for every exportable hg tag.

  Tag names are sanitized (and remapped through tagsmap) first. The tag
  'tip' is ignored, and so are tags whose node is not present in
  mapping_cache. Returns the command counter, passed through checkpoint()
  once per exported tag.
  """
  for name,binnode in repo.tagslist():
    # remap the tag name
    name=sanitize_name(name,"tag",tagsmap)
    # ignore latest revision
    if name=='tip':
      continue
    hexnode=binnode.encode('hex_codec')
    # ignore tags to nodes that are missing (ie, 'in the future')
    if hexnode not in mapping_cache:
      sys.stderr.write('Tag %s refers to unseen node %s\n' % (name, hexnode))
      continue

    rev=int(mapping_cache[hexnode])

    ref=revnum_to_revref(rev, old_marks)
    if ref is None:
      sys.stderr.write('Failed to find reference for creating tag'
          ' %s at r%d\n' % (name,rev))
      continue
    sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (name,rev,ref))
    wr('reset refs/tags/%s' % name)
    wr('from %s' % ref)
    wr()
    count=checkpoint(count)
  return count
295

    
296
def load_mapping(name, filename):
  """Read a 'key = value' mapping file into a dict.

  Blank lines and lines starting with '#' are ignored. Every other line
  has to contain a '='; the text before the first '=' becomes the key, the
  rest the value, both stripped of surrounding whitespace. Lines without a
  '=' are reported on stderr and skipped.

  name is only used in the summary message written to stderr. If filename
  does not exist a message is written and an empty dict is returned.
  """
  cache={}
  if not os.path.exists(filename):
    sys.stderr.write('Could not open mapping file [%s]\n' % (filename))
    return cache
  lre=re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
  loaded=0
  f=open(filename,'r')
  # try/finally makes sure the file is closed even if reading fails
  try:
    lineno=0
    for line in f:
      lineno+=1
      line=line.strip()
      if line=='' or line[0]=='#':
        continue
      m=lre.match(line)
      if m is None:
        sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
        continue
      # put key:value in cache
      cache[m.group(1).strip()]=m.group(2).strip()
      loaded+=1
  finally:
    f.close()
  sys.stderr.write('Loaded %d %s\n' % (loaded, name))
  return cache
320

    
321
def branchtip(repo, heads):
  '''return the tipmost branch head in heads

  heads is searched from its end for a head without 'close' in the sixth
  field of its changelog entry; if every head has it, the last element of
  heads is returned.'''
  fallback = heads[-1]
  for head in reversed(heads):
    extra = repo.changelog.read(head)[5]
    if 'close' not in extra:
      return head
  return fallback
329

    
330
def verify_heads(ui,repo,cache,force,branchesmap):
  """Check that the git branches still match what the last run left behind.

  For every hg branch the SHA1 of the corresponding git branch is compared
  with the value stored in cache; a difference is reported as modification
  outside of hg-fast-export. Afterwards the repository heads are checked
  for branches having more than one head. Each problem is reported on
  stderr and makes the function return False unless force is set. Returns
  True otherwise.
  """
  tips={}
  for name, heads in repo.branchmap().iteritems():
    tips[name]=branchtip(repo, heads)
  # order by descending revision number of the branch tip
  ordered=sorted([(-repo.changelog.rev(n), n, name) for name, n in tips.items()])

  # get list of hg's branches to verify, don't take all git has
  for _,_,hg_branch in ordered:
    hg_branch=get_branch(hg_branch)
    git_branch=sanitize_name(hg_branch,"branch",branchesmap)
    sha1=get_git_sha1(git_branch)
    cached=cache.get(git_branch)
    if sha1==cached:
      continue
    sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
      '\n%s (repo) != %s (cache)\n' % (hg_branch,sha1,cached))
    if not force: return False

  # verify that branch has exactly one head
  seen=set()
  for h in repo.heads():
    (_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,h)
    if branch in seen:
      sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
          repo.changelog.rev(h))
      if not force: return False
    seen.add(branch)

  return True
359

    
360
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,
           authors=None,branchesmap=None,tagsmap=None,
           sob=False,force=False,hgtags=False,notes=False,encoding='',fn_encoding=''):
  """Export the hg repository at repourl as a git-fast-import stream.

  Revisions from the 'tip' stored in tipfile (0 if there is none) up to m
  are exported; a negative m, or one beyond the repository's revision
  count, means "up to the last revision". marksfile, mappingfile,
  headsfile and tipfile are the state files of a previous run; tipfile and
  mappingfile are rewritten after the commits have been exported. If notes
  is set, a git note with the hg hash is written for every exported
  revision. Tags are exported last.

  authors, branchesmap and tagsmap are optional mapping dicts; omitted
  ones default to a fresh empty dict on every call.

  Returns 1 if verify_heads() fails, 0 otherwise.
  """
  def check_cache(filename, contents):
    if len(contents) == 0:
      sys.stderr.write('Warning: %s does not contain any data, this will probably make an incremental import fail\n' % filename)

  if authors is None: authors={}
  if branchesmap is None: branchesmap={}
  if tagsmap is None: tagsmap={}

  _max=int(m)

  old_marks=load_cache(marksfile,lambda s: int(s)-1)
  mapping_cache=load_cache(mappingfile)
  heads_cache=load_cache(headsfile)
  state_cache=load_cache(tipfile)

  # a non-empty state cache means this is an incremental run, for which
  # the other caches are expected to hold data as well
  if len(state_cache) != 0:
    for (name, data) in [(marksfile, old_marks),
                         (mappingfile, mapping_cache),
                         (headsfile, heads_cache)]:
      check_cache(name, data)

  ui,repo=setup_repo(repourl)

  if not verify_heads(ui,repo,heads_cache,force,branchesmap):
    return 1

  # changelog.count() is not available on every repo object; fall back
  # to len(repo) in that case
  try:
    tip=repo.changelog.count()
  except AttributeError:
    tip=len(repo)

  start=int(state_cache.get('tip',0))
  stop=_max
  if _max<0 or stop>tip:
    stop=tip

  # refresh the node -> revision number mapping for all revisions up to stop
  for rev in range(0,stop):
    (revnode,_,_,_,_,_,_,_)=get_changeset(ui,repo,rev,authors)
    mapping_cache[revnode.encode('hex_codec')] = str(rev)

  c=0
  brmap={}
  for rev in range(start,stop):
    c=export_commit(ui,repo,rev,old_marks,stop,c,authors,branchesmap,
                    sob,brmap,hgtags,encoding,fn_encoding)
  if notes:
    for rev in range(start,stop):
      # only the first note of an incremental run continues refs/notes/hg
      c=export_note(ui,repo,rev,c,authors, encoding, rev == start and start != 0)

  state_cache['tip']=stop
  state_cache['repo']=repourl
  save_cache(tipfile,state_cache)
  save_cache(mappingfile,mapping_cache)

  c=export_tags(ui,repo,old_marks,mapping_cache,c,authors,tagsmap)

  sys.stderr.write('Issued %d commands\n' % c)

  return 0
419

    
420
if __name__=='__main__':
  def bail(parser,opt):
    # report the missing option, print the usage and exit with status 2
    sys.stderr.write('Error: No %s option given\n' % opt)
    parser.print_help()
    sys.exit(2)

  parser=OptionParser()

  parser.add_option("-m","--max",type="int",dest="max",
      help="Maximum hg revision to import")
  parser.add_option("--mapping",dest="mappingfile",
      help="File to read last run's hg-to-git SHA1 mapping")
  parser.add_option("--marks",dest="marksfile",
      help="File to read git-fast-import's marks from")
  parser.add_option("--heads",dest="headsfile",
      help="File to read last run's git heads from")
  parser.add_option("--status",dest="statusfile",
      help="File to read status from")
  parser.add_option("-r","--repo",dest="repourl",
      help="URL of repo to import")
  parser.add_option("-s",action="store_true",dest="sob",
      default=False,help="Enable parsing Signed-off-by lines")
  parser.add_option("--hgtags",action="store_true",dest="hgtags",
      default=False,help="Enable exporting .hgtags files")
  parser.add_option("-A","--authors",dest="authorfile",
      help="Read authormap from AUTHORFILE")
  parser.add_option("-B","--branches",dest="branchesfile",
      help="Read branch map from BRANCHESFILE")
  parser.add_option("-T","--tags",dest="tagsfile",
      help="Read tags map from TAGSFILE")
  parser.add_option("-f","--force",action="store_true",dest="force",
      default=False,help="Ignore validation errors by force")
  parser.add_option("-M","--default-branch",dest="default_branch",
      help="Set the default branch")
  parser.add_option("-o","--origin",dest="origin_name",
      help="use <name> as namespace to track upstream")
  parser.add_option("--hg-hash",action="store_true",dest="notes",
      default=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
  parser.add_option("-e",dest="encoding",
      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
  parser.add_option("--fe",dest="fn_encoding",
      help="Assume file names from Mercurial are encoded in <filename_encoding>")

  (options,args)=parser.parse_args()

  # without -m a negative maximum is passed on to hg2git()
  m=options.max
  if m is None:
    m=-1

  # these options are mandatory; complain about the first one missing
  required=[('--marks',options.marksfile),
            ('--mapping',options.mappingfile),
            ('--heads',options.headsfile),
            ('--status',options.statusfile),
            ('--repo',options.repourl)]
  for opt,value in required:
    if value is None:
      bail(parser,opt)

  # the mapping files are optional and default to empty mappings
  authors={}
  if options.authorfile is not None:
    authors=load_mapping('authors', options.authorfile)

  branches={}
  if options.branchesfile is not None:
    branches=load_mapping('branches', options.branchesfile)

  tags={}
  if options.tagsfile is not None:
    tags=load_mapping('tags', options.tagsfile)

  if options.default_branch is not None:
    set_default_branch(options.default_branch)

  if options.origin_name is not None:
    set_origin_name(options.origin_name)

  encoding=options.encoding
  if encoding is None:
    encoding=''

  # file names use the general encoding unless --fe overrides it
  fn_encoding=options.fn_encoding
  if fn_encoding is None:
    fn_encoding=encoding

  sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile,
                  options.headsfile, options.statusfile,
                  authors=authors,branchesmap=branches,tagsmap=tags,
                  sob=options.sob,force=options.force,hgtags=options.hgtags,
                  notes=options.notes,encoding=encoding,fn_encoding=fn_encoding))