# Summarise Mercurial and project-page activity from Apache access logs.
#
# Reads log lines via ARGF (files named on the command line, or standard
# input) and prints, per project name, the number of hg clones, hg pulls,
# hg archive downloads and hits on pages under /projects/<name>, followed
# by the number of parseable and unparseable lines.
#
# Only lines with a 200 response that do not look like crawler traffic
# are examined; other well-formed lines are skipped without being added
# to either the parseable or the unparseable total.

# Use the ApacheLogRegex parser, a neat thing
# See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
require 'apachelogregex'

# This is the format defined in our httpd.conf
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

parser = ApacheLogRegex.new(vhost_combined_format)

# project name -> count of hg clones
clones = Hash.new(0)

# project name -> count of hg pulls
pulls = Hash.new(0)

# project name -> count of hg commits
# NOTE(review): nothing below populates or prints this hash yet.
commits = Hash.new(0)

# project name -> count of hg archive requests (i.e. Download as Zip)
zips = Hash.new(0)

# project name -> count of hits to pages under /projects/projectname
hits = Hash.new(0)

parseable = 0
unparseable = 0

ARGF.each do |line|

  record = parser.parse(line)

  # Most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses with trailing commas and try again
  unless record
    filtered = line.gsub(/([0-9]+\.){3}[0-9]+,\s*/, "")
    record = parser.parse(filtered)
  end

  # discard, but count, unparseable lines
  unless record
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # split into method, path, protocol; only the path is captured
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = Regexp.last_match(1)
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    unparseable += 1
    next
  end

  case components[1]
  when "hg"

    # path is /hg/project?something or /hg/project/something
    repo = components[2]

    # A bare /hg or /hg/ request names no project; skip the counters
    # rather than calling split on nil (which used to abort the run).
    if repo
      project = repo.split("?")[0]

      if repo =~ /&roots=00*$/
        clones[project] += 1
      elsif repo =~ /cmd=capabilities/
        pulls[project] += 1
      elsif components[3] == "archive"
        zips[project] += 1
      end
    end

  when "projects"

    # path is /projects/project or /projects/project/something
    page = components[2]
    if page
      project = page.split("?")[0]
      hits[project] += 1
    end

  end

  parseable += 1
end

# Each clone is also a pull; deduct the clones from the pulls hash,
# because we want that to contain only non-clone pulls. Every clone
# counted for a project is deducted, not just one per project.
clones.each do |project, count|
  pulls[project] -= count
end

print clones, "\n"
print pulls, "\n"
print zips, "\n"
print hits, "\n"

print parseable, " parseable\n"
print unparseable, " unparseable\n"