Chris@978
|
1
|
Chris@980
|
2 # Read an Apache log file in SoundSoftware site format from stdin and
|
Chris@980
|
3 # produce some per-project stats.
|
Chris@978
|
4 #
|
Chris@978
|
5 # Invoke with e.g.
|
Chris@978
|
6 #
|
Chris@978
|
7 # cat /var/log/apache2/code-access.log | \
|
Chris@978
|
8 # script/runner -e production extra/soundsoftware/get-apache-log-stats.rb
|
Chris@978
|
9
|
Chris@975
|
10
|
Chris@975
|
11 # Use the ApacheLogRegex parser, a neat thing
|
Chris@975
|
12 # See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
|
Chris@975
|
13 require 'apachelogregex'
|
Chris@975
|
14
|
Chris@975
|
# Log line layout, matching the LogFormat defined in our httpd.conf.
# The backslash-escaped quotes are part of the format string itself.
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

# Parser that turns one log line into a record keyed by format directive
# (e.g. record["%r"], record["%>s"]).
parser = ApacheLogRegex.new(vhost_combined_format)
|
Chris@975
|
19
|
Chris@975
|
# Per-project tallies. Each is keyed by project name and reads as 0 for
# a project that has not been counted yet:
#   clones - hg clones
#   pulls  - hg pulls
#   pushes - hg pushes
#   zips   - hg archive requests (i.e. Download as Zip)
#   hits   - hits to pages under /projects/projectname
clones, pulls, pushes, zips, hits = Array.new(5) { Hash.new(0) }

# project name -> Project object, filled in as projects are looked up
@projects = {}

# Running totals of log lines we could and could not make sense of
parseable = unparseable = 0
|
Chris@975
|
40
|
Chris@979
|
# Report whether the named project exists and is public.
#
# project - project identifier taken from a request path (may be nil).
#
# Returns false for nil, for purely numerical identifiers, and for
# identifiers that Project.find_by_identifier cannot resolve (a
# "Project not found" message is printed in that last case). Otherwise
# returns the project's is_public? value. Projects that are found are
# remembered in @projects so each is only looked up once.
def is_public_project?(project)
  return false unless project

  # ignore numerical project ids, they are only used when editing projects
  return false if project =~ /^\d+$/

  # already looked up on an earlier line
  return @projects[project].is_public? if @projects.key?(project)

  found = Project.find_by_identifier(project)
  unless found
    print "Project not found: ", project, "\n"
    return false
  end

  @projects[project] = found
  found.is_public?
end
|
Chris@978
|
60
|
Chris@980
|
# Print one line per project for a tally hash, highest count first.
#
# h - hash of project name -> count.
#
# Entries whose count is zero or negative are skipped. Each printed line
# has the form "<count> <project display name> [<project name>]", with
# the display name taken from the Project object cached in @projects.
def print_stats(h)
  ranked = h.sort { |left, right| right[1] <=> left[1] }
  ranked.each do |ident, count|
    next unless count > 0
    print count, " ", @projects[ident].name, " [", ident, "]\n"
  end
end
|
Chris@980
|
68
|
Chris@979
|
# Main pass: read the log from stdin one line at a time and update the
# per-project tallies (clones, pulls, pushes, zips, hits) along with the
# parseable / unparseable line counters.
#
# Only 200 responses that do not look like crawler traffic are
# considered, and only requests for projects that is_public_project?
# accepts are tallied.
STDIN.each do |line|
  record = parser.parse(line)

  # most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses or the word "unknown" with
  # trailing commas and try again
  unless record
    filtered = line.gsub(/(unknown|([0-9]+\.){3}[0-9]+),\s*/, "")
    record = parser.parse(filtered)
  end

  # discard, but count, unparseable lines
  unless record
    print "Line not parseable: ", line, "\n"
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # split into method, path, protocol
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    print "Line not parseable (bad method, path, protocol): ", line, "\n"
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = $~[1]
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    print "Line not parseable (degenerate path): ", line, "\n"
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # path is /hg/project?something or /hg/project/something
    #
    # A bare "/hg" or "/hg/" leaves components[2] nil; guard the split
    # (as the /projects branch below does) so such a request is skipped
    # rather than raising NoMethodError and aborting the whole run.
    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    # Classify the Mercurial request. Order matters: the clone test
    # comes first, so a request matching it is never counted as a pull.
    if components[2] =~ /&roots=00*$/
      clones[project] += 1
    elsif components[2] =~ /cmd=capabilities/
      pulls[project] += 1
    elsif components[2] =~ /cmd=unbundle/
      pushes[project] += 1
    elsif components[3] == "archive"
      zips[project] += 1
    end

  elsif components[1] == "projects"

    # path is /projects/project or /projects/project/something
    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    hits[project] += 1

  end

  # reached only by lines that were not skipped by any check above
  parseable += 1
end
|
Chris@975
|
152
|
Chris@975
|
# Each clone is also a pull; deduct the clones from the pulls hash,
# because we want that to contain only non-clone pulls. The deduction
# is the project's full clone count, not a flat 1 per project, so that
# a project cloned several times does not keep inflated pull figures.
clones.each do |project, clone_count|
  pulls[project] -= clone_count
end
|
Chris@975
|
159
|
Chris@982
|
# Summary of how many log lines were counted as parseable / unparseable
print parseable, " parseable\n"
print unparseable, " unparseable\n"

# One report section per tally: a heading followed by the per-project
# lines produced by print_stats, in this fixed order.
[
  ["\nMercurial clones:\n", clones],
  ["\nMercurial pulls (excluding clones):\n", pulls],
  ["\nMercurial pushes:\n", pushes],
  ["\nMercurial archive (zip file) downloads:\n", zips],
  ["\nProject page hits (excluding crawlers):\n", hits]
].each do |heading, tally|
  print heading
  print_stats tally
end
|
Chris@975
|
178
|
Chris@975
|
179
|