To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / extra / soundsoftware / get-apache-log-stats.rb @ 975:198f764e734c

History | View | Annotate | Download (2.81 KB)

1 975:198f764e734c Chris
2
# Use the ApacheLogRegex parser, a neat thing
3
# See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
4
require 'apachelogregex'
5
6
# Log line layout, as defined in our httpd.conf. The string is
# single-quoted, so the backslashes before the double quotes are kept
# and handed to the parser exactly as written.
vhost_combined_format = '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

# Parser that turns one log line in that layout into a record
# indexed by the format directives above (e.g. "%r", "%>s")
parser = ApacheLogRegex.new(vhost_combined_format)
10
11
# Per-project tallies. Each hash is keyed by project name and yields 0
# for a project it has not seen yet, so an entry can be bumped with +=
# without being initialised first.
clones  = Hash.new(0)  # hg clones
pulls   = Hash.new(0)  # hg pulls
commits = Hash.new(0)  # hg commits
zips    = Hash.new(0)  # hg archive requests (i.e. Download as Zip)
hits    = Hash.new(0)  # hits to pages under /projects/projectname

# Line counters, both starting from zero
parseable = unparseable = 0
28
29
# Walk every log line from the files named on the command line (or
# from standard input) and tally the requests of interest per project.
ARGF.each do |line|

  record = parser.parse(line)

  # most annoyingly, the parser can't handle the comma-separated list
  # in X-Forwarded-For where it has more than one element. If it has
  # failed, remove any IP addresses with trailing commas and try again
  unless record
    filtered = line.gsub(/([0-9]+\.){3}[0-9]+,\s*/, "")
    record = parser.parse(filtered)
  end

  # discard, but count, unparseable lines
  unless record
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next if record["%>s"] != "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # it must consist of exactly three space-separated parts (method,
  # path, protocol); anything else is counted as unparseable
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = Regexp.last_match(1)
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # path is /hg/project?something or /hg/project/something
    target = components[2]

    # A bare /hg or /hg/ has no project component at all. Skip the
    # tallying in that case instead of calling split on nil, which
    # would raise and abort the whole run.
    if target
      project = target.split("?")[0]

      if target =~ /&roots=00*$/
        # request ending in &roots= followed by zeros: counted as a clone
        clones[project] += 1
      elsif target =~ /cmd=capabilities/
        # capabilities request: counted as a pull
        pulls[project] += 1
      elsif components[3] == "archive"
        # /hg/project/archive/...: counted as a zip download
        zips[project] += 1
      end
    end

  elsif components[1] == "projects"

    # path is /projects/project or /projects/project/something
    project = components[2]
    if project
      project = project.split("?")[0]
      hits[project] += 1
    end

  end

  # the line was well-formed, whether or not it matched a tally above
  parseable += 1
end
101
102
# Each clone is also a pull, so at this point a project's pull count
# includes one pull for every clone of it. Deduct the project's full
# clone count (not just 1) so that the pulls hash is left containing
# only non-clone pulls.
clones.each do |project, clone_count|
  pulls[project] -= clone_count
end
108
109
# Dump each per-project tally on a line of its own, in a fixed order:
# clones, non-clone pulls, zip downloads, project page hits
[clones, pulls, zips, hits].each do |tally|
  print tally, "\n"
end

# Then the line totals, each followed by its label
[[parseable, " parseable\n"], [unparseable, " unparseable\n"]].each do |count, label|
  print count, label
end