Revision 981:3469444470cb extra/soundsoftware

View differences:

extra/soundsoftware/get-apache-log-stats.rb
1

  
2
# Read an Apache log file in SoundSoftware site format from stdin and
3
# produce some per-project stats.
4
#
5
# Invoke with e.g.
6
#
7
# cat /var/log/apache2/code-access.log | \
8
#   script/runner -e production extra/soundsoftware/get-apache-log-stats.rb
9

  
10

  
11
# Use the ApacheLogRegex parser, a neat thing
12
# See http://www.simonecarletti.com/blog/2009/02/apache-log-regex-a-lightweight-ruby-apache-log-parser/
13
require 'apachelogregex'
14

  
15
# Log line layout, copied from the LogFormat declared in our httpd.conf.
# The string is single-quoted, so each \" reaches the parser as a literal
# backslash followed by a double quote.
vhost_combined_format =
  '%v:%p %h %{X-Forwarded-For}i %l %u %t \"%r\" %>s %O \"%{Referer}i\" \"%{User-Agent}i\"'

# Parser built from that format; parser.parse(line) is used on every line
# read from stdin.
parser = ApacheLogRegex.new(vhost_combined_format)
19

  
20
# Per-project tallies. Each is a separate Hash keyed by the project
# identifier taken from the request path, with missing keys reading as 0:
#   clones - count of hg clones
#   pulls  - count of hg pulls
#   pushes - count of hg pushes
#   zips   - count of hg archive requests (i.e. Download as Zip)
#   hits   - count of hits to pages under /projects/projectname
clones, pulls, pushes, zips, hits = Array.new(5) { Hash.new(0) }

# project name -> Project object
@projects = {}

# Running totals of log lines, reported at the end of the run
parseable = unparseable = 0
40

  
41
# Decide whether requests for the given project should be counted.
#
# Project records that have been found are remembered in @projects
# (identifier -> Project), so Project.find_by_identifier is called at most
# once per known identifier. Identifiers with no matching record are not
# remembered: every such call prints a "Project not found" line.
#
# project - identifier String taken from a request path, or nil.
#
# Returns false for a nil, empty or unknown identifier; otherwise returns
# whatever the Project's is_public? method returns.
def is_public_project?(project)
  # A nil or empty identifier cannot name a project, so skip the lookup.
  return false if project.nil? || project.empty?

  unless @projects.key?(project)
    pobj = Project.find_by_identifier(project)
    unless pobj
      print "Project not found: ", project, "\n"
      return false
    end
    @projects[project] = pobj
  end

  @projects[project].is_public?
end
57

  
58
# Print one "<count> <project name>" line for every key of h, ordered from
# the highest count to the lowest.
#
# h - Hash of project identifier => count. Each key is looked up in
#     @projects and that object's name is printed.
def print_stats(h)
  ranked = h.keys.sort { |x, y| h[y] <=> h[x] }
  ranked.each do |ident|
    count = h[ident]
    label = @projects[ident].name
    print count, " ", label, "\n"
  end
end
63

  
64
# Classify every log line read from stdin and update the per-project tallies.
#
# Lines the parser rejects (even after the X-Forwarded-For workaround below),
# or whose request line or path is malformed, are printed and counted in
# `unparseable`. Non-200 responses, crawler requests, and /hg or /projects
# requests for a project that is_public_project? rejects are skipped without
# being counted at all. Every remaining line counts as `parseable`, whether
# or not it matched /hg or /projects.
STDIN.each do |line|
  record = parser.parse(line)

  # Most annoyingly, the parser can't handle the comma-separated list in
  # X-Forwarded-For where it has more than one element. If it has failed,
  # remove any IP addresses with trailing commas and try again.
  unless record
    filtered = line.gsub(/([0-9]+\.){3}[0-9]+,\s*/, "")
    record = parser.parse(filtered)
  end

  # Discard, but count, unparseable lines.
  unless record
    print "Line not parseable: ", line, "\n"
    unparseable += 1
    next
  end

  # Discard everything that isn't a 200 OK response.
  next if record["%>s"] != "200"

  # Discard anything apparently requested by a crawler.
  next if record["%{User-Agent}i"] =~ /(bot|slurp|crawler|spider|Redmine)\b/i

  # Pull out the request, e.g. GET / HTTP/1.0
  request = record["%r"]

  # It must be exactly three whitespace-free fields: method, path, protocol.
  unless request =~ /^[^\s]+ ([^\s]+) [^\s]+$/
    print "Line not parseable (bad method, path, protocol): ", line, "\n"
    unparseable += 1
    next
  end

  # Get the path, e.g. /projects/weevilmatic, and split it on /
  path = $~[1]
  components = path.split("/")

  # Should have at least two elements unless path is "/"; the first should
  # be empty (path begins with /).
  if path != "/" && (components.size < 2 || components[0] != "")
    print "Line not parseable (degenerate path): ", line, "\n"
    unparseable += 1
    next
  end

  if components[1] == "hg"

    # Path is /hg/project?something or /hg/project/something. A bare /hg has
    # no third component, so guard against nil before stripping the query
    # string; is_public_project? then rejects the nil identifier.
    repo_part = components[2]
    project = repo_part
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    if repo_part =~ /&roots=00*$/
      clones[project] += 1
    elsif repo_part =~ /cmd=capabilities/
      pulls[project] += 1
    elsif repo_part =~ /cmd=unbundle/
      pushes[project] += 1
    elsif components[3] == "archive"
      zips[project] += 1
    end

  elsif components[1] == "projects"

    # Path is /projects/project or /projects/project/something
    project = components[2]
    project = project.split("?")[0] if project
    next unless is_public_project?(project)

    hits[project] += 1

  end

  parseable += 1
end
146

  
147
# Each clone is also a pull; deduct every clone from the pulls hash, because
# we want that to contain only non-clone pulls. The amount deducted is the
# project's full clone count, not a flat 1 per cloned project.
clones.each do |project, count|
  pulls[project] -= count
end
153

  
154
# Emit the report: one headed section per tally, each listed by print_stats,
# followed by the totals of parseable and unparseable log lines.
[
  ["\nMercurial clones:\n", clones],
  ["\nMercurial pulls (excluding clones):\n", pulls],
  ["\nMercurial pushes:\n", pushes],
  ["\nMercurial archive (zip file) downloads:\n", zips],
  ["\nProject page hits:\n", hits]
].each do |heading, tally|
  print heading
  print_stats tally
end

print parseable, " parseable\n"
print unparseable, " unparseable\n"
171

  

Also available in: Unified diff