To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / extra / soundsoftware / get-apache-log-stats.rb @ 1557:9d6d2f696782

History | View | Annotate | Download (4.3 KB)

1 978:bbb88c44f805 Chris
2 980:9b4919de5317 Chris
# Read an Apache log file in SoundSoftware site format from stdin and
# produce some per-project stats.
#
# Invoke with e.g.
#
# cat /var/log/apache2/code-access.log | \
#   script/runner -e production extra/soundsoftware/get-apache-log-stats.rb
9
10 975:198f764e734c Chris
11
# Use the ApacheLogRegex parser, a neat thing
# See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
require 'apachelogregex'

# This is the format defined in our httpd.conf.
# The string is single-quoted, so each \" below is a literal backslash
# followed by a double quote.
# NOTE(review): presumably this mirrors the escaping used in the httpd.conf
# LogFormat directive, which the parser expects -- confirm before changing
# the quoting.
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

parser = ApacheLogRegex.new(vhost_combined_format)

# All counters default to 0 so they can be incremented without
# initialising each project key first.

# project name -> count of hg clones
clones = Hash.new(0)

# project name -> count of hg pulls
pulls = Hash.new(0)

# project name -> count of hg pushes
pushes = Hash.new(0)

# project name -> count of hg archive requests (i.e. Download as Zip)
zips = Hash.new(0)

# project name -> count of hits to pages under /projects/projectname
hits = Hash.new(0)

# project name -> Project object
# (instance variable rather than a local so that the top-level methods
# defined in this script can see it)
@projects = {}

# Running totals of log lines that were / were not understood
parseable = 0
unparseable = 0
40
41 979:56a38a9f6204 Chris
# Decide whether activity for the given project identifier should be counted.
#
# project:: project identifier taken from a request path (String), or nil
#
# Returns false when project is nil, when it consists only of digits, or
# when Project.find_by_identifier cannot resolve it (in which case a
# "Project not found" line is also printed). Otherwise returns whatever
# is_public? reports for the Project. Projects that were found are kept in
# @projects, so a successful lookup happens at most once per identifier.
def is_public_project?(project)
  return false unless project

  # ignore numerical project ids, they are only used when editing projects
  # (\A and \z anchor the whole string, not just one line of it)
  return false if project =~ /\A\d+\z/

  # already looked up earlier in this run
  return @projects[project].is_public? if @projects.key?(project)

  pobj = Project.find_by_identifier(project)
  unless pobj
    print "Project not found: ", project, "\n"
    return false
  end

  @projects[project] = pobj
  pobj.is_public?
end
60
61 980:9b4919de5317 Chris
# Print one line per project that has a positive count in h, highest
# count first.
#
# h:: Hash mapping project identifier -> Integer count
#
# Each line has the form "<count> <project name> [<identifier>]". Entries
# whose count is zero or negative are omitted. Projects with equal counts
# are listed in identifier order so the output is reproducible. The name
# is read from @projects; if no Project is recorded there for an
# identifier, the identifier itself is printed in place of the name.
def print_stats(h)
  ranked = h.keys.select { |p| h[p] > 0 }.sort_by { |p| [-h[p], p] }
  ranked.each do |p|
    pobj = @projects[p]
    name = pobj ? pobj.name : p
    print h[p], " ", name, " [", p, "]\n"
  end
end
68
69 979:56a38a9f6204 Chris
# Main pass: read the access log from stdin line by line and tally
# per-project activity into the clones/pulls/pushes/zips/hits hashes.
# unparseable counts lines that could not be understood; parseable counts
# lines that reach the end of the loop body. Lines skipped because of
# their status, their user agent, or because the project is not public
# are counted in neither total.
STDIN.each do |line|

  record = parser.parse(line)

  # most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses or the word "unknown" with
  # trailing commas and try again
  unless record
    filtered = line.gsub(/(unknown|([0-9]+\.){3}[0-9]+),\s*/, "")
    record = parser.parse(filtered)
  end

  # discard, but count, unparseable lines
  unless record
    print "Line not parseable: ", line, "\n"
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # split into method, path, protocol
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    print "Line not parseable (bad method, path, protocol): ", line, "\n"
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  # ($~ still holds the match from the request test just above)
  path = $~[1]
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    print "Line not parseable (degenerate path): ", line, "\n"
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # path is /hg/project?something or /hg/project/something

    # components[2] is absent for a bare /hg or /hg/ request; guard
    # before splitting so that such a line is skipped (nil is never a
    # public project) instead of raising NoMethodError
    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    # Classify by the query string attached to the project component.
    # NOTE(review): an all-zero roots= parameter at the end of the query
    # is taken to mean a clone -- confirm against the Mercurial wire
    # protocol in use.
    if components[2] =~ /&roots=00*$/
      clones[project] += 1
    elsif components[2] =~ /cmd=capabilities/
      pulls[project] += 1
    elsif components[2] =~ /cmd=unbundle/
      pushes[project] += 1
    elsif components[3] == "archive"
      zips[project] += 1
    end

  elsif components[1] == "projects"

    # path is /projects/project or /projects/project/something

    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    hits[project] += 1

  end

  parseable += 1

end
152
153
# Each clone is also a pull; deduct it from the pulls hash, because we
# want that to contain only non-clone pulls.
#
# Every clone counted for a project is deducted, not just one per
# project, so a project cloned several times does not keep the surplus
# in its pull total.

clones.each do |project, clone_count|
  pulls[project] -= clone_count
end
159
160 982:6edb748be064 Chris
# Final report: overall line totals first, then one ranked section per
# kind of activity (each section is produced by print_stats).
print parseable, " parseable\n"
print unparseable, " unparseable\n"

print "\nMercurial clones:\n"
print_stats clones

print "\nMercurial pulls (excluding clones):\n"
print_stats pulls

print "\nMercurial pushes:\n"
print_stats pushes

print "\nMercurial archive (zip file) downloads:\n"
print_stats zips

print "\nProject page hits (excluding crawlers):\n"
print_stats hits
178 975:198f764e734c Chris