Mercurial > hg > soundsoftware-site
comparison extra/soundsoftware/get-apache-log-stats.rb @ 975:198f764e734c cannam
Add Apache log extraction script
author | Chris Cannam |
---|---|
date | Thu, 25 Oct 2012 13:38:34 +0100 |
parents | |
children | bbb88c44f805 |
comparison
equal
deleted
inserted
replaced
971:b80f97c892bc | 975:198f764e734c |
---|---|

# Extract per-project usage statistics from Apache access logs.
#
# Reads log lines from the files named on the command line (or from
# standard input, via ARGF) and prints one hash per statistic -- hg
# clones, non-clone hg pulls, zip archive requests, project page hits --
# followed by the counts of parseable and unparseable lines.

# Use the ApacheLogRegex parser, a neat thing
# See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
require 'apachelogregex'

# This is the format defined in our httpd.conf
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

parser = ApacheLogRegex.new(vhost_combined_format)

# Return the project name held in a path component such as
# "project?cmd=capabilities" (the text before any "?"), or nil when the
# component is missing or has no name before the query string.
def project_name(component)
  return nil unless component

  name = component.split("?")[0]
  name unless name.nil? || name.empty?
end

# project name -> count of hg clones
clones = Hash.new(0)

# project name -> count of hg pulls
pulls = Hash.new(0)

# project name -> count of hg commits
# NOTE: nothing below populates or prints this hash yet
commits = Hash.new(0)

# project name -> count of hg archive requests (i.e. Download as Zip)
zips = Hash.new(0)

# project name -> count of hits to pages under /projects/projectname
hits = Hash.new(0)

parseable = 0
unparseable = 0

ARGF.each do |line|

  record = parser.parse(line)

  # most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses with trailing commas and try again
  unless record
    filtered = line.gsub(/([0-9]+\.){3}[0-9]+,\s*/, "")
    record = parser.parse(filtered)
  end

  # discard, but count, unparseable lines
  unless record
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # split into method, path, protocol
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = Regexp.last_match(1)
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # path is /hg/project?something or /hg/project/something; a bare
    # /hg or /hg/ has no project component and is not counted anywhere
    project = project_name(components[2])

    if project
      if components[2] =~ /&roots=00*$/
        clones[project] += 1
      elsif components[2] =~ /cmd=capabilities/
        pulls[project] += 1
      elsif components[3] == "archive"
        zips[project] += 1
      end
    end

  elsif components[1] == "projects"

    # path is /projects/project or /projects/project/something
    project = project_name(components[2])
    hits[project] += 1 if project

  end

  parseable += 1
end

# Each clone is also a pull; deduct every clone (not just one per
# project) from the pulls hash, because we want that to contain only
# non-clone pulls

clones.each do |project, count|
  pulls[project] -= count
end

print clones, "\n"
print pulls, "\n"
print zips, "\n"
print hits, "\n"

print parseable, " parseable\n"
print unparseable, " unparseable\n"
