annotate extra/soundsoftware/get-apache-log-stats.rb @ 975:198f764e734c cannam

Add Apache log extraction script
author Chris Cannam
date Thu, 25 Oct 2012 13:38:34 +0100
parents
children bbb88c44f805
rev   line source
# Summarise Mercurial and project-page activity from Apache access logs.
#
# Reads log lines from the files named on the command line (or from
# standard input when no files are given) and prints, in this order:
#
#   * project name -> count of hg clones
#   * project name -> count of hg pulls that were not clones
#   * project name -> count of hg archive (zip) downloads
#   * project name -> count of hits to pages under /projects/projectname
#   * the number of parseable and of unparseable lines
#
# Only "200" responses that do not look like crawler traffic are counted;
# lines discarded for those two reasons count as neither parseable nor
# unparseable.

# Use the ApacheLogRegex parser, a neat thing
# See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
require 'apachelogregex'

# This is the format defined in our httpd.conf
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

parser = ApacheLogRegex.new(vhost_combined_format)

# project name -> count of hg clones
clones = Hash.new(0)

# project name -> count of hg pulls
pulls = Hash.new(0)

# project name -> count of hg commits
# NOTE(review): nothing below populates or prints this hash yet.
commits = Hash.new(0)

# project name -> count of hg archive requests (i.e. Download as Zip)
zips = Hash.new(0)

# project name -> count of hits to pages under /projects/projectname
hits = Hash.new(0)

parseable = 0
unparseable = 0

# Extracts the project name from a path component such as "project" or
# "project?cmd=capabilities". Returns nil when the component is missing
# or the name part is empty, so callers can skip the request instead of
# crashing on nil or tallying under a nil/"" key.
project_name = lambda do |component|
  name = component.to_s.split("?", 2).first
  (name.nil? || name.empty?) ? nil : name
end

ARGF.each do |line|
  record = parser.parse(line)

  # most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses with trailing commas and try again
  record ||= parser.parse(line.gsub(/([0-9]+\.){3}[0-9]+,\s*/, ""))

  # discard, but count, unparseable lines
  unless record
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0 and split it into method, path,
  # protocol; only the path is captured
  request = record["%r"]
  request_match = /^[^\s]+ ([^\s]+) [^\s]+$/.match(request)
  unless request_match
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = request_match[1]
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    unparseable += 1
    next
  end

  case components[1]
  when "hg"
    # path is /hg/project?something or /hg/project/something; a bare
    # /hg or /hg/ has no project component and is not tallied
    project = project_name.call(components[2])
    if project
      if components[2] =~ /&roots=00*$/
        clones[project] += 1
      elsif components[2] =~ /cmd=capabilities/
        pulls[project] += 1
      elsif components[3] == "archive"
        zips[project] += 1
      end
    end
  when "projects"
    # path is /projects/project or /projects/project/something
    project = project_name.call(components[2])
    hits[project] += 1 if project
  end

  parseable += 1
end

# Each clone is also a pull (presumably because a clone issues its own
# cmd=capabilities request as well -- confirm against the hg wire
# protocol); deduct every clone from the pulls hash, because we want
# that to contain only non-clone pulls. Because missing keys default to
# 0, a project whose clones outnumber its recorded pulls ends up with a
# negative entry rather than being silently clamped.
clones.each do |project, clone_count|
  pulls[project] -= clone_count
end

print clones, "\n"
print pulls, "\n"
print zips, "\n"
print hits, "\n"

print parseable, " parseable\n"
print unparseable, " unparseable\n"