# Summarise Apache access logs (read from the files named on the command
# line, or from standard input) into per-project counts of Mercurial
# clones, pulls, zip downloads and project page hits.

# Use the ApacheLogRegex parser, a neat thing
# See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
require 'apachelogregex'

# This is the format defined in our httpd.conf
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

parser = ApacheLogRegex.new(vhost_combined_format)

# project name -> count of hg clones
clones = Hash.new(0)

# project name -> count of hg pulls
pulls = Hash.new(0)

# project name -> count of hg commits
# (declared for completeness; nothing below populates or prints it yet)
commits = Hash.new(0)

# project name -> count of hg archive requests (i.e. Download as Zip)
zips = Hash.new(0)

# project name -> count of hits to pages under /projects/projectname
hits = Hash.new(0)

parseable = 0
unparseable = 0

ARGF.each do |line|

  record = parser.parse(line)

  # Most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses with trailing commas and try again.
  unless record
    filtered = line.gsub(/([0-9]+\.){3}[0-9]+,\s*/, "")
    record = parser.parse(filtered)
  end

  # discard, but count, unparseable lines
  unless record
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # split into method, path, protocol
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = $~[1]
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # path is /hg/project?something or /hg/project/something
    target = components[2]

    # A bare "/hg" (or "/hg/") request names no project; there is nothing
    # to attribute it to, so skip classification instead of crashing on nil.
    if target
      project = target.split("?")[0]

      if target =~ /&roots=00*$/
        clones[project] += 1
      elsif target =~ /cmd=capabilities/
        pulls[project] += 1
      elsif components[3] == "archive"
        zips[project] += 1
      end
    end

  elsif components[1] == "projects"

    # path is /projects/project or /projects/project/something
    project = components[2]
    if project
      project = project.split("?")[0]
      hits[project] += 1
    end

  end

  parseable += 1
end

# Each clone is also a pull; deduct it from the pulls hash, because we
# want that to contain only non-clone pulls. Every counted clone has to be
# deducted, so subtract the project's clone count rather than a flat 1.
clones.each do |project, clone_count|
  pulls[project] -= clone_count
end

print clones, "\n"
print pulls, "\n"
print zips, "\n"
print hits, "\n"

print parseable, " parseable\n"
print unparseable, " unparseable\n"