annotate extra/soundsoftware/get-apache-log-stats.rb @ 979:56a38a9f6204 cannam

Various fixes
author Chris Cannam
date Thu, 25 Oct 2012 14:08:58 +0100
parents bbb88c44f805
children 9b4919de5317
rev   line source
Chris@978 1
Chris@978 2 # Read an Apache log file from the SoundSoftware site and produce some
Chris@978 3 # per-project stats.
Chris@978 4 #
Chris@978 5 # Invoke with e.g.
Chris@978 6 #
Chris@978 7 # cat /var/log/apache2/code-access.log | \
Chris@978 8 # script/runner -e production extra/soundsoftware/get-apache-log-stats.rb
Chris@978 9
Chris@975 10
Chris@975 11 # Use the ApacheLogRegex parser, a neat thing
Chris@975 12 # See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
Chris@975 13 require 'apachelogregex'
Chris@975 14
# Log format string, matching the one defined in our httpd.conf
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

parser = ApacheLogRegex.new(vhost_combined_format)

# Per-project tallies. Each maps a project name to a count and yields
# zero for a project that has not been seen yet.
clones = Hash.new(0) # hg clones
pulls  = Hash.new(0) # hg pulls
pushes = Hash.new(0) # hg pushes
zips   = Hash.new(0) # hg archive requests (i.e. Download as Zip)
hits   = Hash.new(0) # hits to pages under /projects/projectname

# project name -> Project object, filled in by is_public_project?
@projects = {}

# Counters for log lines: incremented by the main loop below
parseable = 0
unparseable = 0
Chris@975 40
# Report whether the named project exists and is public.
#
# project:: project identifier taken from a request path, or nil
#
# Returns false for a nil identifier and for an identifier that
# Project.find_by_identifier cannot resolve (a "Project not found"
# message is printed in the latter case); otherwise returns the result
# of the project's is_public? method.
#
# Lookups are memoised in @projects. Failed lookups are remembered too
# (stored as nil), so a log containing many requests for an unknown
# identifier costs one lookup rather than one per line.
def is_public_project?(project)
  return false unless project

  unless @projects.key?(project)
    @projects[project] = Project.find_by_identifier(project)
  end

  pobj = @projects[project]
  unless pobj
    # Reported on every call, cached or not, so the diagnostics are the
    # same as they were before misses were cached.
    print "Project not found: ", project, "\n"
    return false
  end

  pobj.is_public?
end
Chris@978 57
# Main loop: read the access log from standard input one line at a time
# and update the per-project tallies (clones, pulls, pushes, zips, hits)
# and the parseable/unparseable line counters.
STDIN.each_line do |line|

  record = parser.parse(line)

  # most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses with trailing commas and try again
  unless record
    filtered = line.gsub(/([0-9]+\.){3}[0-9]+,\s*/, "")
    record = parser.parse(filtered)
  end

  # discard, but count, unparseable lines
  unless record
    print "Line not parseable: ", line, "\n"
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # split into method, path, protocol; only the path is captured
  request_match = /^[^\s]+ ([^\s]+) [^\s]+$/.match(request)
  unless request_match
    print "Line not parseable (bad method, path, protocol): ", line, "\n"
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = request_match[1]
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    print "Line not parseable (degenerate path): ", line, "\n"
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # path is /hg/project?something or /hg/project/something

    # A request for bare /hg or /hg/ has no third component; leave the
    # project nil so that it is skipped below instead of raising.
    repo = components[2]
    project = repo && repo.split("?")[0]
    next unless is_public_project?(project)

    if repo =~ /&roots=00*$/
      clones[project] += 1
    elsif repo =~ /cmd=capabilities/
      pulls[project] += 1
    elsif repo =~ /cmd=unbundle/
      pushes[project] += 1
    elsif components[3] == "archive"
      zips[project] += 1
    end

  elsif components[1] == "projects"

    # path is /projects/project or /projects/project/something

    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    hits[project] += 1

  end

  # Only lines that get this far are counted: requests that were
  # skipped above (non-200, crawler, non-public project) are not.
  parseable += 1
end
Chris@975 141
# Each clone is also a pull; deduct it from the pulls hash, because we
# want that to contain only non-clone pulls. A project may have been
# cloned more than once, so deduct its whole clone count rather than 1.

clones.each do |project, count|
  pulls[project] -= count
end

# Dump each tally hash on its own line, followed by the line counters

print clones, "\n"
print pulls, "\n"
print pushes, "\n"
print zips, "\n"
print hits, "\n"

print parseable, " parseable\n"
print unparseable, " unparseable\n"
Chris@975 157