Revision 975:198f764e734c extra

View differences:

extra/soundsoftware/get-apache-log-stats.rb
1

  
2
# Use the ApacheLogRegex parser, a neat thing
3
# See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
4
require 'apachelogregex'
5

  
6
# Log line layout, matching the format defined in our httpd.conf. The
# directives are listed one per entry and joined with single spaces. The
# backslash-escaped quotes are kept literally (single-quoted strings do not
# unescape them).
vhost_combined_format = [
  '%v:%p',
  '%h',
  '%{X-Forwarded-For}i',
  '%l',
  '%u',
  '%t',
  '\"%r\"',
  '%>s',
  '%O',
  '\"%{Referer}i\"',
  '\"%{User-Agent}i\"'
].join(' ')

# Parser that turns one log line into a record keyed by directive
# (e.g. "%r", "%>s"), or a false value when the line does not match.
parser = ApacheLogRegex.new(vhost_combined_format)
10

  
11
# Per-project tallies, keyed by project name. Hash.new(0) makes an unseen
# project read as zero, so `tally[name] += 1` needs no initialisation.
clones  = Hash.new(0)   # count of hg clones
pulls   = Hash.new(0)   # count of hg pulls
commits = Hash.new(0)   # count of hg commits
zips    = Hash.new(0)   # count of hg archive requests (i.e. Download as Zip)
hits    = Hash.new(0)   # count of hits to pages under /projects/projectname

# Totals of log lines that could / could not be interpreted.
parseable = unparseable = 0
28

  
29
# Read every log line from ARGF, classify the request it describes, and
# update the per-project tallies (clones, pulls, zips, hits) together with
# the parseable/unparseable totals.
ARGF.each do |line|

  record = parser.parse(line)

  # Most annoyingly, the parser can't handle the comma-separated list in
  # X-Forwarded-For where it has more than one element. If the first
  # attempt failed, remove any IP addresses with trailing commas and try
  # again.
  record ||= parser.parse(line.gsub(/([0-9]+\.){3}[0-9]+,\s*/, ""))

  # discard, but count, unparseable lines
  unless record
    unparseable += 1
    next
  end

  # discard everything that isn't a 200 OK response
  next unless record["%>s"] == "200"

  # discard anything apparently requested by a crawler
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # pull out request e.g. GET / HTTP/1.0
  request = record["%r"]

  # split into method, path, protocol; only the path is captured
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    unparseable += 1
    next
  end

  # get the path e.g. /projects/weevilmatic and split on /
  path = Regexp.last_match(1)
  components = path.split("/")

  # should have at least two elements unless path is "/"; first should
  # be empty (begins with /)
  if path != "/" && (components.size < 2 || components[0] != "")
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # path is /hg/project?something or /hg/project/something
    repo = components[2]

    # A bare "/hg" or "/hg/" names no project, so components[2] is nil;
    # calling split on it would raise NoMethodError and abort the whole
    # run. Leave such a request unclassified, exactly as a bare
    # "/projects" is handled below.
    if repo
      project = repo.split("?")[0]

      if repo =~ /&roots=00*$/
        clones[project] += 1
      elsif repo =~ /cmd=capabilities/
        pulls[project] += 1
      elsif components[3] == "archive"
        zips[project] += 1
      end
    end

  elsif components[1] == "projects"

    # path is /projects/project or /projects/project/something
    project = components[2]
    if project
      project = project.split("?")[0]
      hits[project] += 1
    end

  end

  # every line that reaches this point was understood, whether or not it
  # contributed to one of the tallies
  parseable += 1
end
101

  
102
# Each clone is also a pull; deduct it from the pulls hash, because we
# want that to contain only non-clone pulls. A project cloned several
# times has been counted as a pull once per clone, so subtract the
# project's full clone count rather than a flat 1.
clones.each do |project, clone_count|
  pulls[project] -= clone_count
end
108

  
109
# Report the per-project tallies in the order clones, pulls, zips, hits,
# one hash per line, followed by the parseable/unparseable line totals.
[clones, pulls, zips, hits].each { |tally| puts tally.to_s }

puts "#{parseable} parseable"
puts "#{unparseable} unparseable"
116

  

Also available in: Unified diff