To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / extra / soundsoftware / get-apache-log-stats.rb @ 982:6edb748be064

History | View | Annotate | Download (4.15 KB)

1 978:bbb88c44f805 Chris
2 980:9b4919de5317 Chris
# Read an Apache log file in SoundSoftware site format from stdin and
# produce some per-project stats.
#
# Invoke with e.g.
#
# cat /var/log/apache2/code-access.log | \
#   script/runner -e production extra/soundsoftware/get-apache-log-stats.rb
9
10 975:198f764e734c Chris
11
# Use the ApacheLogRegex parser, a neat thing
# See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
require 'apachelogregex'

# This is the format defined in our httpd.conf. The string is deliberately
# single-quoted: that keeps each \" as a literal backslash followed by a
# double quote instead of letting Ruby collapse the escape.
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

# Parser for log lines in the format above; record fields are looked up by
# their format directive, e.g. record["%r"].
parser = ApacheLogRegex.new(vhost_combined_format)
19
20
# All of the counters below are hashes with a default of 0, so a project
# can be incremented without being initialised first.

# project name -> count of hg clones
clones = Hash.new(0)

# project name -> count of hg pulls
pulls = Hash.new(0)

# project name -> count of hg pushes
pushes = Hash.new(0)

# project name -> count of hg archive requests (i.e. Download as Zip)
zips = Hash.new(0)

# project name -> count of hits to pages under /projects/projectname
hits = Hash.new(0)

# project name -> Project object. This is an instance variable rather than
# a local so that the methods defined in this script can reach it.
@projects = {}

# Running totals of log lines that were, and were not, fully processed
parseable = 0
unparseable = 0
40
41 979:56a38a9f6204 Chris
# Report whether +project+ names an existing, public project.
#
# project:: project identifier as extracted from a request path, or nil
#
# Returns false for a nil identifier and for an identifier that
# Project.find_by_identifier cannot resolve; otherwise returns whatever the
# project's is_public? returns.
#
# Every lookup result is remembered in @projects, including failed ones
# (stored as nil). An unknown identifier therefore costs one lookup and
# prints one "Project not found" message, rather than one per log line.
def is_public_project?(project)
  return false unless project

  unless @projects.key?(project)
    pobj = Project.find_by_identifier(project)
    print "Project not found: ", project, "\n" unless pobj
    @projects[project] = pobj
  end

  pobj = @projects[project]
  pobj ? pobj.is_public? : false
end
57
58 980:9b4919de5317 Chris
# Print one "<count> <project name>" line per project, highest count first.
#
# h:: hash of project identifier -> count
#
# Entries whose count is zero or negative are left out. The display name is
# taken from the Project object cached in @projects; if there is none for
# an identifier, the identifier itself is printed instead of failing.
def print_stats(h)
  ranked = h.select { |_, count| count > 0 }.sort_by { |_, count| -count }
  ranked.each do |ident, count|
    pobj = @projects[ident]
    print count, " ", (pobj ? pobj.name : ident), "\n"
  end
end
65
66 979:56a38a9f6204 Chris
# Classify each log line read from stdin and bump the matching per-project
# counter. Lines that cannot be interpreted are reported and counted in
# unparseable; lines skipped with a bare `next` are counted in neither total.
STDIN.each do |line|

  record = parser.parse(line)

  # most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses with trailing commas and try again
  unless record
    filtered = line.gsub(/([0-9]+\.){3}[0-9]+,\s*/, "")
    record = parser.parse(filtered)
  end

  # discard, but count, unparseable lines
  unless record
    print "Line not parseable: ", line, "\n"
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # split into method, path, protocol
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    print "Line not parseable (bad method, path, protocol): ", line, "\n"
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = Regexp.last_match(1)
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    print "Line not parseable (degenerate path): ", line, "\n"
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # path is /hg/project?something or /hg/project/something

    # A request for /hg alone has no third component; guard the split so
    # it is skipped like any other non-project request instead of raising.
    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    # The clone test must come first so that a request matching it is
    # never counted under one of the later categories.
    if components[2] =~ /&roots=00*$/
      clones[project] += 1
    elsif components[2] =~ /cmd=capabilities/
      pulls[project] += 1
    elsif components[2] =~ /cmd=unbundle/
      pushes[project] += 1
    elsif components[3] == "archive"
      zips[project] += 1
    end

  elsif components[1] == "projects"

    # path is /projects/project or /projects/project/something

    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    hits[project] += 1

  end

  # reached only by lines that were not skipped by any `next` above
  parseable += 1

end
148
149
# Each clone is also a pull; deduct it from the pulls hash, because we
# want that to contain only non-clone pulls. The deduction is one per
# clone, so subtract the project's whole clone count rather than 1 per
# project.

clones.each do |project, count|
  pulls[project] -= count
end
155
156 982:6edb748be064 Chris
# Final report: overall line totals first, then one ranked table per
# request category.
print parseable, " parseable\n"
print unparseable, " unparseable\n"

print "\nMercurial clones:\n"
print_stats clones

print "\nMercurial pulls (excluding clones):\n"
print_stats pulls

print "\nMercurial pushes:\n"
print_stats pushes

print "\nMercurial archive (zip file) downloads:\n"
print_stats zips

print "\nProject page hits (excluding crawlers):\n"
print_stats hits
174 975:198f764e734c Chris