To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / extra / soundsoftware / get-apache-log-stats.rb @ 979:56a38a9f6204

History | View | Annotate | Download (3.81 KB)

1 978:bbb88c44f805 Chris
2
# Read an Apache log file from the SoundSoftware site and produce some
# per-project stats.
#
# Invoke with e.g.
#
# cat /var/log/apache2/code-access.log | \
#   script/runner -e production extra/soundsoftware/get-apache-log-stats.rb

# Use the ApacheLogRegex parser, a neat thing
# See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
require 'apachelogregex'

# This is the format defined in our httpd.conf
#
# The string is single-quoted, so each \" reaches ApacheLogRegex verbatim
# (backslash included), exactly as it would be written in a LogFormat line.
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

# Parser for one log line; the fields it returns are looked up by their
# format directive, e.g. "%r" or "%>s".
parser = ApacheLogRegex.new(vhost_combined_format)
19
20
# Per-project tallies. Each is created with a default of 0 so that a
# project can be incremented without being initialised first.

# project name -> count of hg clones
clones = Hash.new(0)

# project name -> count of hg pulls
pulls = Hash.new(0)

# project name -> count of hg pushes
pushes = Hash.new(0)

# project name -> count of hg archive requests (i.e. Download as Zip)
zips = Hash.new(0)

# project name -> count of hits to pages under /projects/projectname
hits = Hash.new(0)

# project name -> Project object
# (an instance variable rather than a local so that methods defined at the
# top level of this script can reach it)
@projects = Hash.new

# number of log lines processed to completion
parseable = 0

# number of log lines rejected as malformed
unparseable = 0
40
41 979:56a38a9f6204 Chris
# Report whether +project+ names a public project.
#
# project:: a project identifier taken from a request path, or nil when the
#           path had no project component.
#
# Returns false for a nil identifier and for an identifier that
# Project.find_by_identifier cannot resolve; otherwise returns whatever the
# project's is_public? returns.
#
# Every identifier is looked up at most once: the result is remembered in
# @projects, and a failed lookup is remembered as nil. That keeps a log full
# of requests for a non-existent project from issuing one query and one
# "Project not found" message per line.
def is_public_project?(project)
  return false unless project

  unless @projects.key?(project)
    found = Project.find_by_identifier(project)
    print "Project not found: ", project, "\n" unless found
    @projects[project] = found
  end

  cached = @projects[project]
  cached ? cached.is_public? : false
end
57
58 979:56a38a9f6204 Chris
# Main pass: read the access log from standard input, one line at a time,
# and update the per-project tallies.
#
# A line is dropped without being tallied when it cannot be parsed (counted
# in unparseable), when its status is not 200, when its user agent looks
# like a crawler, or when it names a project that is_public_project?
# rejects. Every line that reaches the end of the block is counted in
# parseable, whether or not it matched one of the tallied URL shapes.
STDIN.each do |line|

  record = parser.parse(line)

  # most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses with trailing commas and try again
  unless record
    filtered = line.gsub(/([0-9]+\.){3}[0-9]+,\s*/, "")
    record = parser.parse(filtered)
  end

  # discard, but count, unparseable lines
  unless record
    print "Line not parseable: ", line, "\n"
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # split into method, path, protocol
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    print "Line not parseable (bad method, path, protocol): ", line, "\n"
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = Regexp.last_match(1)
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    print "Line not parseable (degenerate path): ", line, "\n"
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # path is /hg/project?something or /hg/project/something

    # A bare /hg or /hg/ request has no project component at all; leave
    # project as nil so that it is skipped like any other non-project
    # request instead of raising NoMethodError on nil.
    target = components[2]
    project = target ? target.split("?")[0] : nil
    next unless is_public_project?(project)

    # The query string is still attached to target, so the kind of hg
    # request is recognised from it. The first matching rule wins.
    if target =~ /&roots=00*$/
      # a request whose roots parameter is all zeros is treated as a clone
      clones[project] += 1
    elsif target =~ /cmd=capabilities/
      pulls[project] += 1
    elsif target =~ /cmd=unbundle/
      pushes[project] += 1
    elsif components[3] == "archive"
      zips[project] += 1
    end

  elsif components[1] == "projects"

    # path is /projects/project or /projects/project/something

    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    hits[project] += 1

  end

  parseable += 1

end
141
142
# Each clone is also a pull; deduct it from the pulls hash, because we
# want that to contain only non-clone pulls
#
# The deduction is one pull per clone, not one per project: a project that
# was cloned N times has N clone-related entries in its pull count.
clones.each do |project, clone_count|
  pulls[project] -= clone_count
end

# Emit each tally, then the line totals, on standard output.
print clones, "\n"
print pulls, "\n"
print pushes, "\n"
print zips, "\n"
print hits, "\n"

print parseable, " parseable\n"
print unparseable, " unparseable\n"