annotate extra/soundsoftware/get-apache-log-stats.rb @ 1628:9c5f8e24dadc live tip

Quieten this cron script
author Chris Cannam
date Tue, 25 Aug 2020 11:38:49 +0100
parents a97e573d7f87
children
rev   line source
Chris@978 1
# Read an Apache log file in SoundSoftware site format from stdin and
# print per-project statistics (Mercurial clones, pulls, pushes, zip
# downloads and project page hits).
#
# Invoke with e.g.
#
#   cat /var/log/apache2/code-access.log | \
#     script/runner -e production extra/soundsoftware/get-apache-log-stats.rb

# Log lines are parsed with ApacheLogRegex; see
# http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
require 'apachelogregex'

# The log format, exactly as defined in our httpd.conf
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

parser = ApacheLogRegex.new(vhost_combined_format)

# Per-project counters. Each is keyed by the project identifier taken
# from the request path, and any project not yet seen counts as 0.
clones = Hash.new(0)   # hg clones
pulls  = Hash.new(0)   # hg pulls
pushes = Hash.new(0)   # hg pushes
zips   = Hash.new(0)   # hg archive requests (i.e. Download as Zip)
hits   = Hash.new(0)   # hits to pages under /projects/projectname

# Cache of project identifier -> Project object, filled in lazily by
# is_public_project? and read again when printing the stats
@projects = {}

# Running totals of log lines that were / were not usable
parseable = 0
unparseable = 0
Chris@975 40
# Report whether the given project identifier refers to a public project.
#
# project:: identifier string taken from a request path, or nil
#
# Returns false for nil, for purely numerical ids (these are only used
# when editing projects) and for identifiers with no matching Project;
# otherwise returns whatever Project#is_public? says.
#
# Every lookup result is remembered in @projects, including misses
# (stored as nil), so each identifier costs at most one database query
# and an unknown identifier is reported only once per run.
def is_public_project?(project)
  return false unless project

  # ignore numerical project ids, they are only used when editing projects
  return false if project =~ /\A\d+\z/

  unless @projects.key?(project)
    found = Project.find_by_identifier(project)
    print "Project not found: ", project, "\n" unless found
    @projects[project] = found
  end

  pobj = @projects[project]
  pobj ? pobj.is_public? : false
end
Chris@978 60
# Print one line per project from the counter hash h, largest count
# first, in the form "<count> <project name> [<identifier>]".
# Projects whose count is zero or negative are left out. Project names
# are looked up in @projects, so every key with a positive count must
# have an entry there.
def print_stats(h)
  ranked = h.to_a.sort { |left, right| right[1] <=> left[1] }
  ranked.each do |identifier, count|
    next unless count > 0
    print count, " ", @projects[identifier].name, " [", identifier, "]\n"
  end
end
Chris@980 68
# Main pass: classify each log line read from stdin and bump the
# matching per-project counter. Lines that are skipped with `next`
# (non-200 responses, crawlers, non-public or unknown projects) are not
# added to the parseable total.
STDIN.each do |line|

  record = parser.parse(line)

  # most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses or the word "unknown" with
  # trailing commas and try again
  unless record
    filtered = line.gsub(/(unknown|([0-9]+\.){3}[0-9]+),\s*/, "")
    record = parser.parse(filtered)
  end

  # discard, but count, unparseable lines
  unless record
    print "Line not parseable: ", line, "\n"
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # split into method, path, protocol
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    print "Line not parseable (bad method, path, protocol): ", line, "\n"
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = $~[1]
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    print "Line not parseable (degenerate path): ", line, "\n"
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # path is /hg/project?something or /hg/project/something

    # A bare /hg or /hg/ request has no project component at all; guard
    # against nil here (as the /projects branch does) so that such a
    # line is skipped instead of raising NoMethodError
    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    # components[2] is known to be non-nil from here on
    if components[2] =~ /&roots=00*$/
      clones[project] += 1
    elsif components[2] =~ /cmd=capabilities/
      pulls[project] += 1
    elsif components[2] =~ /cmd=unbundle/
      pushes[project] += 1
    elsif components[3] == "archive"
      zips[project] += 1
    end

  elsif components[1] == "projects"

    # path is /projects/project or /projects/project/something

    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    hits[project] += 1

  end

  parseable += 1
end
Chris@975 152
# Each clone is also a pull; deduct the clones from the pulls hash,
# because we want that to contain only non-clone pulls. A project that
# was cloned N times has N of its pulls accounted for by clones, so
# subtract the whole clone count rather than just 1 per project.

clones.each do |project, count|
  pulls[project] -= count
end

print parseable, " parseable\n"
print unparseable, " unparseable\n"


# Each section lists projects in descending order of count; projects
# with nothing to report are omitted by print_stats

print "\nMercurial clones:\n"
print_stats clones

print "\nMercurial pulls (excluding clones):\n"
print_stats pulls

print "\nMercurial pushes:\n"
print_stats pushes

print "\nMercurial archive (zip file) downloads:\n"
print_stats zips

print "\nProject page hits (excluding crawlers):\n"
print_stats hits
Chris@975 178
Chris@975 179