Chris@909
|
1 # encoding: utf-8
|
Chris@909
|
2 require 'strscan'
|
Chris@909
|
3
|
Chris@909
|
4 module CodeRay
|
Chris@909
|
5
|
Chris@909
|
6 autoload :WordList, 'coderay/helpers/word_list'
|
Chris@909
|
7
|
Chris@909
|
8 # = Scanners
|
Chris@909
|
9 #
|
Chris@909
|
10 # This module holds the Scanner class and its subclasses.
|
Chris@909
|
11 # For example, the Ruby scanner is named CodeRay::Scanners::Ruby
|
Chris@909
|
12 # and can be found in coderay/scanners/ruby.
|
Chris@909
|
13 #
|
Chris@909
|
14 # Scanner also provides methods and constants for the register
|
Chris@909
|
15 # mechanism and the [] method that returns the Scanner class
|
Chris@909
|
16 # belonging to the given lang.
|
Chris@909
|
17 #
|
Chris@909
|
18 # See PluginHost.
|
Chris@909
|
19 module Scanners
|
Chris@909
|
20 extend PluginHost
|
Chris@909
|
21 plugin_path File.dirname(__FILE__), 'scanners'
|
Chris@909
|
22
|
Chris@909
|
23
|
Chris@909
|
24 # = Scanner
|
Chris@909
|
25 #
|
Chris@909
|
26 # The base class for all Scanners.
|
Chris@909
|
27 #
|
Chris@909
|
28 # It is a subclass of Ruby's great +StringScanner+, which
|
Chris@909
|
29 # makes it easy to access the scanning methods inside.
|
Chris@909
|
30 #
|
Chris@909
|
31 # It is also +Enumerable+, so you can use it like an Array of
|
Chris@909
|
32 # Tokens:
|
Chris@909
|
33 #
|
Chris@909
|
34 # require 'coderay'
|
Chris@909
|
35 #
|
Chris@909
|
36 # c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
|
Chris@909
|
37 #
|
Chris@909
|
38 # for text, kind in c_scanner
|
Chris@909
|
39 # puts text if kind == :operator
|
Chris@909
|
40 # end
|
Chris@909
|
41 #
|
Chris@909
|
42 # # prints: (*==)++;
|
Chris@909
|
43 #
|
Chris@909
|
44 # OK, this is a very simple example :)
|
Chris@909
|
45 # You can also use +map+, +any?+, +find+ and even +sort_by+,
|
Chris@909
|
46 # if you want.
|
Chris@909
|
47 class Scanner < StringScanner
|
Chris@909
|
48
|
Chris@909
|
49 extend Plugin
|
Chris@909
|
50 plugin_host Scanners
|
Chris@909
|
51
|
Chris@909
|
# Raised if a Scanner fails while scanning.
ScanError = Class.new StandardError

# The default options for all scanner classes.
#
# #initialize reads self.class::DEFAULT_OPTIONS, so a subclass provides
# its own defaults by defining a DEFAULT_OPTIONS constant of its own.
DEFAULT_OPTIONS = { }

# Token kinds :comment, :doctype and :docstring.
# NOTE(review): presumably the kinds that do not count as lines of code;
# confirm against the code that reads this constant.
KINDS_NOT_LOC = [:comment, :doctype, :docstring]
|
Chris@909
|
61
|
Chris@909
|
62 attr_accessor :state
|
Chris@909
|
63
|
Chris@909
|
64 class << self
|
Chris@909
|
65
|
Chris@909
|
# Normalizes the given code into a string with UNIX newlines, in the
# scanner's internal encoding, with invalid and undefined characters
# replaced by placeholders.
#
# Anything that is not a String is converted with #to_s first.
# The result is not guaranteed to be a copy: an empty string, or a
# string that needs no conversion, is returned as is.
def normalize code
  code = code.to_s unless code.is_a? ::String
  return code if code.empty?

  # Strings that do not know about encodings only need their newlines fixed.
  return to_unix(code) unless code.respond_to? :encoding

  encode_with_encoding code, self.encoding
end
|
Chris@909
|
82
|
Chris@909
|
# The typical filename suffix for this scanner's language.
#
# The value is memoized: the first call stores +extension+ (by default
# the scanner's lang) converted with #to_s; later calls return the
# stored value, whatever argument they are given.
def file_extension extension = lang
  @file_extension ||= extension.to_s
end
|
Chris@909
|
87
|
Chris@909
|
# The encoding used internally by this scanner.
#
# Looks up +name+ with Encoding.find when that method is available and
# memoizes the result, so later calls ignore their argument.
def encoding name = 'UTF-8'
  @encoding ||= defined?(Encoding.find) && Encoding.find(name)
end
|
Chris@909
|
92
|
Chris@909
|
# The lang of this Scanner class, which is equal to its Plugin ID
# (the value of @plugin_id).
def lang
  @plugin_id
end
|
Chris@909
|
97
|
Chris@909
|
98 protected
|
Chris@909
|
99
|
Chris@909
|
# Returns +code+ transcoded to +target_encoding+ with UNIX newlines;
# undefined and invalid characters are replaced.
#
# A string that is already tagged with +target_encoding+ and is valid
# is only passed through to_unix. If it is tagged with the target
# encoding but invalid, the source encoding is obtained from
# guess_encoding instead.
def encode_with_encoding code, target_encoding
  source_encoding = code.encoding

  if source_encoding == target_encoding
    return to_unix(code) if code.valid_encoding?
    # The encoding tag does not match the data; find out what it really is.
    source_encoding = guess_encoding code
  end

  code.encode target_encoding, source_encoding,
    :universal_newline => true, :undef => :replace, :invalid => :replace
end
|
Chris@909
|
113
|
Chris@909
|
# Converts "\r\n" and lone "\r" line endings to "\n".
# Returns +code+ itself when it contains no "\r".
def to_unix code
  return code unless code.include?("\r")
  code.gsub(/\r\n?/, "\n")
end
|
Chris@909
|
117
|
Chris@909
|
# Guesses the encoding of +s+ by piping its first 1024 characters to
# the external command "file -b --mime -" and reading the charset it
# reports.
#
# Returns Encoding::BINARY when the command cannot be run, prints
# nothing, reports no charset, or reports one Ruby does not know.
def guess_encoding s
  #:nocov:
  IO.popen("file -b --mime -", "w+") do |file|
    file.write s[0, 1024]
    file.close_write
    # gets returns nil if the command printed nothing.
    charset = file.gets.to_s[/charset=([-\w]+)/, 1]
    charset ? Encoding.find(charset) : Encoding::BINARY
  end
rescue ArgumentError, SystemCallError
  # ArgumentError: unknown charset name; SystemCallError: command missing or pipe failure.
  Encoding::BINARY
  #:nocov:
end
|
Chris@909
|
131
|
Chris@909
|
132 end
|
Chris@909
|
133
|
Chris@909
|
# Create a new Scanner.
#
# * +code+ is the input String; it is normalized with Scanner.normalize
#   and then handled by the superclass StringScanner.
# * +options+ is a Hash with Symbols as keys.
#   It is merged with the default options of the class (you can
#   overwrite default options here.)
#   If options[:tokens] is given, it is used to collect the tokens.
#   Else, a Tokens object is used.
#
# Raises NotImplementedError when called on Scanner itself instead of
# a subclass. Calls #setup as the last step.
def initialize code = '', options = {}
  if self.class == Scanner
    raise NotImplementedError, "I am only the basic Scanner class. I can't scan anything. :( Use my subclasses."
  end

  @options = self.class::DEFAULT_OPTIONS.merge options

  super self.class.normalize(code)

  @tokens = options[:tokens] || Tokens.new
  # Let the token collector know its scanner, if it cares.
  @tokens.scanner = self if @tokens.respond_to? :scanner=

  setup
end
|
Chris@909
|
157
|
Chris@909
|
# Sets back the scanner: calls the superclass reset, then reset_instance.
# Subclasses should redefine the reset_instance method instead of this one.
def reset
  super
  reset_instance
end
|
Chris@909
|
164
|
Chris@909
|
# Set a new string to be scanned.
#
# +code+ is normalized with Scanner.normalize before it is handed to
# the superclass; afterwards reset_instance is called.
def string= code
  super self.class.normalize(code)
  reset_instance
end
|
Chris@909
|
171
|
Chris@909
|
# The Plugin ID for this scanner (delegates to the class-level lang).
def lang
  self.class.lang
end
|
Chris@909
|
176
|
Chris@909
|
# The default file extension for this scanner (delegates to the
# class-level file_extension).
def file_extension
  self.class.file_extension
end
|
Chris@909
|
181
|
Chris@909
|
# Scan the code and returns all tokens in a Tokens object.
#
# * +source+ may be nil (re-scan the current string after a reset),
#   an Array of parts (joined before scanning), or anything else
#   Scanner.normalize accepts.
# * +options+ is merged over the options given to #initialize;
#   options[:tokens], if present, receives the tokens.
#
# Errors raised by scan_tokens are re-raised through raise_inspect.
# For an Array +source+, the result is whatever
# Tokens#split_into_parts returns for the sizes of the parts;
# otherwise it is the token collection itself.
def tokenize source = nil, options = {}
  options = @options.merge(options)
  @tokens = options[:tokens] || @tokens || Tokens.new
  @tokens.scanner = self if @tokens.respond_to? :scanner=

  case source
  when Array
    self.string = self.class.normalize(source.join)
  when nil
    reset
  else
    self.string = self.class.normalize(source)
  end

  begin
    scan_tokens @tokens, options
  rescue => e
    message = "Error in %s#scan_tokens, initial state was: %p" % [self.class, defined?(state) && state]
    raise_inspect e.message, @tokens, message, 30, e.backtrace
  end

  @cached_tokens = @tokens
  return @tokens unless source.is_a? Array

  @tokens.split_into_parts(*source.map { |part| part.size })
end
|
Chris@909
|
210
|
Chris@909
|
# Cache the result of tokenize: scans on first use, then returns the
# cached tokens.
def tokens
  @cached_tokens ||= tokenize
end
|
Chris@909
|
215
|
Chris@909
|
# Traverse the tokens: forwards the block to #tokens' each.
def each &block
  tokens.each(&block)
end
|
Chris@909
|
220 include Enumerable
|
Chris@909
|
221
|
Chris@909
|
# The current line position of the scanner, starting with 1.
# See also: #column.
#
# +pos+ is an index into binary_string and defaults to the scanner's
# current position.
#
# Beware, this is implemented inefficiently (it counts the newlines
# in front of +pos+ on every call). It should be used for debugging only.
def line pos = self.pos
  return 1 if pos <= 0
  binary_string[0...pos].count("\n") + 1
end
|
Chris@909
|
231
|
Chris@909
|
# The current column position of the scanner, starting with 1.
# See also: #line.
#
# +pos+ is an index into binary_string and defaults to the scanner's
# current position.
def column pos = self.pos
  return 1 if pos <= 0
  # Index of the last newline in front of pos; -1 when on the first line.
  line_start = binary_string.rindex("\n", pos - 1) || -1
  pos - line_start
end
|
Chris@909
|
238
|
Chris@909
|
# The string in binary encoding.
#
# To be used with #pos, which is the index of the byte the scanner
# will scan next.
#
# The result is memoized in @binary_string. A binary copy is only made
# when the string's byte size differs from its character size;
# otherwise the string itself is returned.
def binary_string
  @binary_string ||=
    if string.respond_to?(:bytesize) && string.bytesize != string.size
      #:nocov:
      string.dup.force_encoding('binary')
      #:nocov:
    else
      string
    end
end
|
Chris@909
|
253
|
Chris@909
|
254 protected
|
Chris@909
|
255
|
Chris@909
|
# Can be implemented by subclasses to do some initialization
# that has to be done once per instance. Does nothing here.
#
# Use reset for initialization that has to be done once per
# scan.
def setup  # :doc:
end
|
Chris@909
|
263
|
Chris@909
|
# This is the central method, and commonly the only one a
# subclass implements.
#
# Subclasses must implement this method; it must return +tokens+
# and must only use Tokens#<< for storing scanned tokens!
#
# This base implementation always raises NotImplementedError.
def scan_tokens tokens, options  # :doc:
  raise NotImplementedError, "#{self.class}#scan_tokens not implemented."
end
|
Chris@909
|
272
|
Chris@909
|
# Resets the scanner.
#
# Clears the token collection (if it responds to #clear and the
# :keep_tokens option is not set) and drops the cached tokens and the
# cached binary string.
def reset_instance
  @tokens.clear if @tokens.respond_to?(:clear) && !@options[:keep_tokens]
  @cached_tokens = nil
  @binary_string = nil if defined? @binary_string
end
|
Chris@909
|
279
|
Chris@909
|
# Scanner error with additional status information.
#
# Raises a ScanError whose message reports +msg+, the number of
# +tokens+ and the last ten of them, the current line, column and pos,
# the last match, +state+, bol?/eos?, and up to +ambit+ characters of
# binary_string on either side of the current position.
# +backtrace+ is used as the backtrace of the raised error.
def raise_inspect msg, tokens, state = self.state || 'No state given!', ambit = 30, backtrace = caller
  # Clamp the start of the "before" window to 0: a negative start index
  # would be counted from the end of the string.
  window_start = [pos - ambit, 0].max
  raise ScanError, <<-EOE % [


***ERROR in %s: %s (after %d tokens)

tokens:
%s

current line: %d column: %d pos: %d
matched: %p state: %p
bol? = %p, eos? = %p

surrounding code:
%p ~~ %p


***ERROR***

  EOE
    File.basename(caller[0]),
    msg,
    tokens.respond_to?(:size) ? tokens.size : 0,
    tokens.respond_to?(:last) ? tokens.last(10).map { |t| t.inspect }.join("\n") : '',
    line, column, pos,
    matched, state, bol?, eos?,
    binary_string[window_start, pos - window_start],
    binary_string[pos, ambit],
  ], backtrace
end
|
Chris@909
|
311
|
Chris@909
|
# Shorthand for scan_until(/\z/).
# This method also avoids a JRuby 1.9 mode bug.
#
# Returns the unscanned remainder of the string and moves the scanner
# to the end with #terminate.
def scan_rest
  remainder = rest
  terminate
  remainder
end
|
Chris@909
|
319
|
Chris@909
|
320 end
|
Chris@909
|
321
|
Chris@909
|
322 end
|
Chris@909
|
323 end |