comparison vendor/gems/coderay-1.0.0/lib/coderay/scanner.rb @ 909:cbb26bc654de redmine-1.3

Update to Redmine 1.3-stable branch (Redmine SVN rev 8964)
author Chris Cannam
date Fri, 24 Feb 2012 19:09:32 +0000
parents
children
comparison
equal deleted inserted replaced
908:c6c2cbd0afee 909:cbb26bc654de
1 # encoding: utf-8
2 require 'strscan'
3
4 module CodeRay
5
6 autoload :WordList, 'coderay/helpers/word_list'
7
8 # = Scanners
9 #
10 # This module holds the Scanner class and its subclasses.
11 # For example, the Ruby scanner is named CodeRay::Scanners::Ruby and
12 # can be found in coderay/scanners/ruby.
13 #
14 # Scanner also provides methods and constants for the register
15 # mechanism and the [] method that returns the Scanner class
16 # belonging to the given lang.
17 #
18 # See PluginHost.
19 module Scanners
20 extend PluginHost
21 plugin_path File.dirname(__FILE__), 'scanners'
22
23
24 # = Scanner
25 #
26 # The base class for all Scanners.
27 #
28 # It is a subclass of Ruby's great +StringScanner+, which
29 # makes it easy to access the scanning methods inside.
30 #
31 # It is also +Enumerable+, so you can use it like an Array of
32 # Tokens:
33 #
34 # require 'coderay'
35 #
36 # c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
37 #
38 # for text, kind in c_scanner
39 # puts text if kind == :operator
40 # end
41 #
42 # # prints: (*==)++;
43 #
44 # OK, this is a very simple example :)
45 # You can also use +map+, +any?+, +find+ and even +sort_by+,
46 # if you want.
47 class Scanner < StringScanner
48
49 extend Plugin
50 plugin_host Scanners
51
# Error class raised when a Scanner fails while scanning
# (see raise_inspect).
ScanError = Class.new(StandardError)

# Default options shared by all scanner classes.
#
# Scanner#initialize merges the caller's options into
# self.class::DEFAULT_OPTIONS, so a subclass can provide its own defaults
# by defining a constant of the same name.
DEFAULT_OPTIONS = { }

# NOTE(review): presumably the token kinds that do not count as lines of
# code; nothing in this file reads the constant - confirm against callers.
KINDS_NOT_LOC = [:comment, :doctype, :docstring]

# Scanner state; read by tokenize and raise_inspect when they build error
# reports.
attr_accessor :state
63
64 class << self
65
# Turns +code+ into a String with UNIX newlines.
#
# Anything that is not a String is converted with +to_s+ first. When
# Strings know their encoding, the text is additionally converted to the
# scanner's internal encoding, with invalid and undefined characters
# replaced by placeholders (see encode_with_encoding). Otherwise only the
# newlines are converted (see to_unix).
#
# Note that the result is not necessarily a new object: an empty String,
# or one that needs no conversion, is returned as it is.
def normalize code
  text = code.is_a?(::String) ? code : code.to_s
  return text if text.empty?

  if text.respond_to? :encoding
    encode_with_encoding text, self.encoding
  else
    to_unix text
  end
end
82
# The typical filename suffix for this scanner's language.
#
# The first call stores +extension+ (by default the scanner's lang) as a
# String; later calls return the stored value and ignore the argument.
def file_extension extension = lang
  @file_extension = extension.to_s unless @file_extension
  @file_extension
end
87
# The encoding used internally by this scanner.
#
# Looked up by +name+ on first use and remembered afterwards, so +name+
# only matters for the first call. When Encoding.find is not defined the
# result is nil.
def encoding name = 'UTF-8'
  return @encoding if @encoding
  @encoding = defined?(Encoding.find) && Encoding.find(name)
end
92
# The lang of this Scanner class. This is simply the Plugin ID stored in
# @plugin_id.
def lang
  @plugin_id
end
97
98 protected
99
# Converts +code+ to +target_encoding+ with UNIX newlines.
#
# * Already in the target encoding and valid: only newlines are converted.
# * Tagged with the target encoding but invalid: the real source encoding
#   is guessed (see guess_encoding) and the text is transcoded from it.
# * Any other encoding: transcoded from the String's own encoding.
#
# While transcoding, invalid and undefined characters are replaced.
def encode_with_encoding code, target_encoding
  already_target = code.encoding == target_encoding
  return to_unix(code) if already_target && code.valid_encoding?

  source_encoding = already_target ? guess_encoding(code) : code.encoding
  # print "encode_with_encoding from #{source_encoding} to #{target_encoding}"
  code.encode(target_encoding, source_encoding,
              :universal_newline => true, :undef => :replace, :invalid => :replace)
end
113
# Replaces every "\r\n" and every lone "\r" in +code+ with "\n".
# A String without any "\r" is returned unchanged (the same object).
def to_unix code
  return code unless code.index(?\r)
  code.gsub(/\r\n?/, "\n")
end
117
# Guesses the encoding of +s+ by piping its first 1024 characters to the
# external command <tt>file -b --mime -</tt> and reading the charset it
# reports.
#
# Returns an Encoding. Falls back to Encoding::BINARY when
# * the command prints nothing or no <tt>charset=</tt> value,
# * Ruby does not know the reported charset, or
# * the command cannot be run or talked to (for example because the
#   +file+ utility is not installed).
def guess_encoding s
  #:nocov:
  IO.popen("file -b --mime -", "w+") do |file|
    file.write s[0, 1024]
    file.close_write
    # gets returns nil when the command printed nothing, and the match is
    # nil without a charset; neither may reach Encoding.find, which raises
    # TypeError (not ArgumentError) for nil.
    charset = file.gets.to_s[/charset=([-\w]+)/, 1]
    begin
      charset ? Encoding.find(charset) : Encoding::BINARY
    rescue ArgumentError
      # Charset name unknown to Ruby.
      Encoding::BINARY
    end
  end
rescue SystemCallError, IOError
  # The helper program could not be started or the pipe broke.
  Encoding::BINARY
  #:nocov:
end
131
132 end
133
# Create a new Scanner.
#
# * +code+ is the input; it is normalized (see Scanner.normalize) and
#   then handed to the superclass StringScanner.
# * +options+ is a Hash with Symbols as keys.
#   It is merged with the default options of the class (you can
#   overwrite default options here.)
#
# If options[:tokens] is given, it is used to collect the tokens.
# Else, a Tokens object is used.
#
# Raises NotImplementedError when called on Scanner itself, since only
# subclasses can scan.
def initialize code = '', options = {}
  if instance_of? Scanner
    raise NotImplementedError, "I am only the basic Scanner class. I can't scan anything. :( Use my subclasses."
  end

  @options = self.class::DEFAULT_OPTIONS.merge options

  normalized = self.class.normalize(code)
  super normalized

  @tokens = options[:tokens] || Tokens.new
  # Let the token collector know its scanner, if it cares.
  @tokens.scanner = self if @tokens.respond_to? :scanner=

  setup
end
157
# Sets back the scanner: first the superclass reset, then the
# scanner-specific reset_instance.
#
# Subclasses should redefine reset_instance instead of this method.
def reset
  super()
  reset_instance
end
164
# Set a new string to be scanned.
#
# The string is normalized (see Scanner.normalize) before it is handed
# to the superclass; afterwards reset_instance is called.
def string= code
  normalized = self.class.normalize(code)
  super normalized
  reset_instance
end
171
# The Plugin ID for this scanner, as reported by its class.
def lang
  scanner_class = self.class
  scanner_class.lang
end
176
# The default file extension for this scanner, as reported by its class.
def file_extension
  scanner_class = self.class
  scanner_class.file_extension
end
181
# Scans the code and returns the resulting tokens.
#
# * +source+ selects what is scanned:
#   * +nil+ rescans the current string (the scanner is reset first),
#   * an Array is joined into one string, and the result is split back
#     into one part per Array element,
#   * anything else replaces the current string.
# * +options+ are merged over the options given to the constructor;
#   options[:tokens], when present, is the object tokens are stored in.
#
# Any StandardError raised by scan_tokens is reported through
# raise_inspect, together with the scanner class and state.
def tokenize source = nil, options = {}
  options = @options.merge(options)
  @tokens = options[:tokens] || @tokens || Tokens.new
  @tokens.scanner = self if @tokens.respond_to? :scanner=

  if source.nil?
    reset
  elsif source.is_a? Array
    self.string = self.class.normalize(source.join)
  else
    self.string = self.class.normalize(source)
  end

  begin
    scan_tokens @tokens, options
  rescue => e
    message = "Error in %s#scan_tokens, initial state was: %p" % [self.class, defined?(state) && state]
    raise_inspect e.message, @tokens, message, 30, e.backtrace
  end

  @cached_tokens = @tokens
  return @tokens unless source.is_a? Array

  part_sizes = source.map { |part| part.size }
  @tokens.split_into_parts(*part_sizes)
end
210
# The tokens of the current string. The string is tokenized on first use;
# the result is cached until reset_instance clears the cache.
def tokens
  return @cached_tokens if @cached_tokens
  @cached_tokens = tokenize
end
215
# Traverse the tokens: hands +block+ to the #each method of the (cached)
# token collection and returns whatever that call returns.
def each &block
  token_stream = tokens
  token_stream.each(&block)
end
220 include Enumerable
221
# The line number of byte position +pos+ (by default the scanner's
# current position), starting with 1.
# See also: #column.
#
# Beware, this is implemented inefficiently: it counts the newlines in
# everything before +pos+ on every call. It should be used for debugging
# only.
def line pos = self.pos
  if pos > 0
    1 + binary_string[0, pos].count("\n")
  else
    1
  end
end
231
# The column of byte position +pos+ (by default the scanner's current
# position), starting with 1.
# See also: #line.
def column pos = self.pos
  return 1 unless pos > 0

  # Index of the last newline before +pos+, if there is one.
  last_newline = binary_string.rindex(?\n, pos - 1)
  last_newline ? pos - last_newline : pos + 1
end
238
# The string in binary encoding.
#
# To be used with #pos, which is the index of the byte the scanner
# will scan next.
#
# A binary-encoded copy is only made when the string's byte count differs
# from its character count; otherwise the string itself is used. The
# result is cached until reset_instance clears it.
def binary_string
  return @binary_string if @binary_string

  source = string
  multibyte = source.respond_to?(:bytesize) && source.bytesize != source.size
  #:nocov:
  @binary_string = multibyte ? source.dup.force_encoding('binary') : source
  #:nocov:
end
253
254 protected
255
# Hook for subclasses: initialization that has to be done once per
# instance. Called at the end of the constructor; does nothing by
# default.
#
# Use reset for initialization that has to be done once per scan.
def setup  # :doc:
end
263
# This is the central method, and commonly the only one a
# subclass implements.
#
# Subclasses must implement this method; it must return +tokens+
# and must only use Tokens#<< for storing scanned tokens!
#
# This default implementation always raises NotImplementedError.
def scan_tokens tokens, options  # :doc:
  complaint = "#{self.class}#scan_tokens not implemented."
  raise NotImplementedError, complaint
end
272
# Resets the scanner-specific state:
# * empties the token collection, unless the :keep_tokens option is set
#   or the collection cannot be cleared,
# * drops the cached result of tokenize,
# * drops the cached binary string, if one was ever created.
def reset_instance
  if @tokens.respond_to?(:clear)
    @tokens.clear unless @options[:keep_tokens]
  end
  @cached_tokens = nil
  @binary_string = nil if defined? @binary_string
end
279
# Raises a ScanError with additional status information.
#
# * +msg+ is the error description shown in the headline.
# * +tokens+ is the token collection; its size and its last 10 entries
#   are reported when it responds to +size+ / +last+.
# * +state+ is reported as the scanner state (defaults to #state, or
#   'No state given!' when that is not set).
# * +ambit+ is the number of bytes of code shown before and after the
#   current position.
# * +backtrace+ becomes the backtrace of the raised error.
#
# The report also contains the current line, column and position, the
# last match, and whether the scanner is at the beginning of a line or
# at the end of the string.
def raise_inspect msg, tokens, state = self.state || 'No state given!', ambit = 30, backtrace = caller
  # Start of the "code before pos" window. Clamp it at 0: a negative
  # start index would make String#[] count from the END of the string and
  # report unrelated code (or nil) whenever pos < ambit.
  window_start = [pos - ambit, 0].max
  raise ScanError, <<-EOE % [


***ERROR in %s: %s (after %d tokens)

tokens:
%s

current line: %d column: %d pos: %d
matched: %p state: %p
bol? = %p, eos? = %p

surrounding code:
%p ~~ %p


***ERROR***

  EOE
    File.basename(caller[0]),
    msg,
    tokens.respond_to?(:size) ? tokens.size : 0,
    tokens.respond_to?(:last) ? tokens.last(10).map { |t| t.inspect }.join("\n") : '',
    line, column, pos,
    matched, state, bol?, eos?,
    binary_string[window_start, pos - window_start],
    binary_string[pos, ambit],
  ], backtrace
end
311
# Shorthand for scan_until(/\z/): returns everything that has not been
# scanned yet and moves the scanner to the end of the string.
# This method also avoids a JRuby 1.9 mode bug.
def scan_rest
  remaining = rest
  terminate
  remaining
end
319
320 end
321
322 end
323 end