Chris@210
|
1 module CodeRay
|
Chris@210
|
2
|
Chris@210
|
3 require 'coderay/helpers/plugin'
|
Chris@210
|
4
|
Chris@210
|
5 # = Scanners
|
Chris@210
|
6 #
|
Chris@210
|
7 # This module holds the Scanner class and its subclasses.
|
Chris@210
|
8 # For example, the Ruby scanner is named CodeRay::Scanners::Ruby
|
Chris@210
|
9 # and can be found in coderay/scanners/ruby.
|
Chris@210
|
10 #
|
Chris@210
|
11 # Scanner also provides methods and constants for the register
|
Chris@210
|
12 # mechanism and the [] method that returns the Scanner class
|
Chris@210
|
13 # belonging to the given lang.
|
Chris@210
|
14 #
|
Chris@210
|
15 # See PluginHost.
|
Chris@210
|
16 module Scanners
|
Chris@210
|
17 extend PluginHost
|
Chris@210
|
18 plugin_path File.dirname(__FILE__), 'scanners'
|
Chris@210
|
19
|
Chris@210
|
20 require 'strscan'
|
Chris@210
|
21
|
Chris@210
|
22 # = Scanner
|
Chris@210
|
23 #
|
Chris@210
|
24 # The base class for all Scanners.
|
Chris@210
|
25 #
|
Chris@210
|
26 # It is a subclass of Ruby's great +StringScanner+, which
|
Chris@210
|
27 # makes it easy to access the scanning methods inside.
|
Chris@210
|
28 #
|
Chris@210
|
29 # It is also +Enumerable+, so you can use it like an Array of
|
Chris@210
|
30 # Tokens:
|
Chris@210
|
31 #
|
Chris@210
|
32 # require 'coderay'
|
Chris@210
|
33 #
|
Chris@210
|
34 # c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
|
Chris@210
|
35 #
|
Chris@210
|
36 # for text, kind in c_scanner
|
Chris@210
|
37 # puts text if kind == :operator
|
Chris@210
|
38 # end
|
Chris@210
|
39 #
|
Chris@210
|
40 # # prints: (*==)++;
|
Chris@210
|
41 #
|
Chris@210
|
42 # OK, this is a very simple example :)
|
Chris@210
|
43 # You can also use +map+, +any?+, +find+ and even +sort_by+,
|
Chris@210
|
44 # if you want.
|
Chris@210
|
45 class Scanner < StringScanner
|
Chris@210
|
46
|
Chris@210
|
47 extend Plugin
|
Chris@210
|
48 plugin_host Scanners
|
Chris@210
|
49
|
Chris@210
|
50 # Raised if a Scanner fails while scanning
|
Chris@210
|
51 ScanError = Class.new(Exception)
|
Chris@210
|
52
|
Chris@210
|
53 require 'coderay/helpers/word_list'
|
Chris@210
|
54
|
Chris@210
|
55 # The default options for all scanner classes.
|
Chris@210
|
56 #
|
Chris@210
|
57 # Define DEFAULT_OPTIONS in a subclass to override them.
|
Chris@210
|
58 DEFAULT_OPTIONS = { :stream => false }
|
Chris@210
|
59
|
Chris@210
|
60 KINDS_NOT_LOC = [:comment, :doctype]
|
Chris@210
|
61
|
Chris@210
|
62 class << self
|
Chris@210
|
63
|
Chris@210
|
# Returns whether this Scanner class can be used in streaming mode.
#
# Scanner#initialize checks <tt>kind_of? Streamable</tt> on the scanner
# *instance*, which is true when the class includes Streamable. This
# method runs on the class object itself, where +is_a?+ would only be
# true for a class that was *extended* with Streamable, so the included
# case is tested with Module#include?. The +is_a?+ check is kept so
# classes that extend Streamable still answer +true+ as before.
def streamable?
  include?(Streamable) || is_a?(Streamable)
end
|
Chris@210
|
68
|
Chris@210
|
# Normalizes +code+ before it is handed to the scanner.
#
# * The input is coerced with +to_s+.
# * On runtimes whose strings carry an encoding, anything that is not
#   valid UTF-8 is copied and re-tagged as Windows-1252. If the bytes are
#   not valid in that encoding either, the original tag is restored and
#   the text is transcoded to UTF-8 with '?' standing in for invalid or
#   undefined characters.
# * Finally the line endings are normalized with String#to_unix.
#
# The caller's string is never modified; recoding works on a +dup+.
def normify code
  text = code.to_s
  needs_recoding = text.respond_to?(:encoding) &&
    !(text.encoding.name == 'UTF-8' && text.valid_encoding?)
  if needs_recoding
    text = text.dup
    source_encoding = text.encoding
    text.force_encoding 'Windows-1252'
    unless text.valid_encoding?
      text.force_encoding source_encoding
      # Broken UTF-8 takes a detour through UTF-16BE first -- presumably
      # because a UTF-8 -> UTF-8 encode! would not replace the bad bytes.
      if text.encoding.name == 'UTF-8'
        text.encode! 'UTF-16BE', :invalid => :replace, :undef => :replace, :replace => '?'
      end
      text.encode! 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '?'
    end
  end
  text.to_unix
end
|
Chris@210
|
85
|
Chris@210
|
# Reader/writer for the file extension associated with this scanner class.
#
# With a truthy +extension+ the value is stored (as a String) and returned.
# Without one, the stored value is returned; it defaults to the class's
# +plugin_id+ converted to a String the first time it is asked for.
def file_extension extension = nil
  return @file_extension = extension.to_s if extension
  @file_extension ||= plugin_id.to_s
end
|
Chris@210
|
93
|
Chris@210
|
94 end
|
Chris@210
|
95
|
Chris@210
|
96 =begin
|
Chris@210
|
97 ## Excluded for speed reasons; protected seems to make methods slow.
|
Chris@210
|
98
|
Chris@210
|
99 # Save the StringScanner methods from being called.
|
Chris@210
|
100 # This would not be useful for highlighting.
|
Chris@210
|
101 strscan_public_methods =
|
Chris@210
|
102 StringScanner.instance_methods -
|
Chris@210
|
103 StringScanner.ancestors[1].instance_methods
|
Chris@210
|
104 protected(*strscan_public_methods)
|
Chris@210
|
105 =end
|
Chris@210
|
106
|
Chris@210
|
# Creates a new Scanner.
#
# * +code+ is the input; it is normalized with Scanner.normify and then
#   passed on to the StringScanner superclass.
# * +options+ is a Hash with Symbol keys. It is merged over the class's
#   DEFAULT_OPTIONS, so entries given here win.
# * +block+ is the callback used for streamed highlighting.
#
# A token container passed as <tt>options[:tokens]</tt> is used as is.
# Otherwise, if :stream is set, a TokenStream wrapping +block+ is created
# (the scanner must be Streamable, else NotStreamableError is raised);
# if :stream is not set, a Tokens object is created.
#
# A warning is printed when :stream and the presence of a block disagree.
# The base class itself cannot be instantiated.
def initialize code='', options = {}, &block
  if self.class == Scanner
    raise "I am only the basic Scanner class. I can't scan "\
      "anything. :( Use my subclasses."
  end

  @options = self.class::DEFAULT_OPTIONS.merge options

  super Scanner.normify(code)

  @tokens = options[:tokens]
  if @options[:stream]
    unless block_given?
      warn "warning in CodeRay::Scanner.new: :stream is set, "\
        "but no block was given"
    end
    raise NotStreamableError, self unless kind_of? Streamable
    @tokens ||= TokenStream.new(&block)
  else
    if block_given?
      warn "warning in CodeRay::Scanner.new: Block given, "\
        "but :stream is #{@options[:stream]}"
    end
    @tokens ||= Tokens.new
  end
  @tokens.scanner = self

  # Per-instance hook for subclasses.
  setup
end
|
Chris@210
|
143
|
Chris@210
|
# Rewinds the scanner: first the superclass's own reset, then the
# scanner's per-scan state via +reset_instance+.
def reset
  super()
  reset_instance
end
|
Chris@210
|
148
|
Chris@210
|
# Replaces the input string.
#
# The new input is normalized with Scanner.normify, handed to the
# superclass, and the per-scan state is cleared with +reset_instance+.
def string= code
  normalized = Scanner.normify(code)
  on_rubinius_1_0_1 = defined?(RUBY_DESCRIPTION) && RUBY_DESCRIPTION['rubinius 1.0.1']
  if on_rubinius_1_0_1
    # Special case for this one runtime: set the state directly instead
    # of going through the superclass setter.
    reset_state
    @string = normalized
  else
    super normalized
  end
  reset_instance
end
|
Chris@210
|
159
|
Chris@210
|
160 # More mnemonic accessor name for the input string.
|
Chris@210
|
161 alias code string
|
Chris@210
|
162 alias code= string=
|
Chris@210
|
163
|
Chris@210
|
# The plugin ID of this scanner's class.
def lang
  scanner_class = self.class
  scanner_class.plugin_id
end
|
Chris@210
|
168
|
Chris@210
|
# Scans the code and returns the resulting tokens.
#
# * +new_string+, if given, replaces the current input (via +string=+).
# * +options+ is merged over the scanner's own options and passed on to
#   +scan_tokens+.
#
# In streaming mode the scanner is rewound with +reset+ when no new
# string was given, and the token container itself is returned. Otherwise
# the return value of +scan_tokens+ is returned. Either way the result is
# remembered in @cached_tokens.
def tokenize new_string=nil, options = {}
  scan_options = @options.merge(options)
  self.string = new_string if new_string

  unless @options[:stream]  # :stream must have been set already
    return @cached_tokens = scan_tokens(@tokens, scan_options)
  end

  reset unless new_string
  scan_tokens @tokens, scan_options
  @cached_tokens = @tokens
end
|
Chris@210
|
182
|
Chris@210
|
# The tokens of the last scan; runs +tokenize+ on first use and reuses
# the remembered result afterwards.
def tokens
  return @cached_tokens if @cached_tokens
  @cached_tokens = tokenize
end
|
Chris@210
|
186
|
Chris@210
|
# Whether the scanner is in streaming mode, i.e. whether its :stream
# option is set. Always answers +true+ or +false+.
def streaming?
  @options[:stream] ? true : false
end
|
Chris@210
|
191
|
Chris@210
|
# Yields every token of the scan to the block.
#
# Raises ArgumentError in streaming mode.
def each &block
  if @options[:stream]
    raise ArgumentError, 'Cannot traverse TokenStream.'
  end
  tokens.each(&block)
end
|
Chris@210
|
198 include Enumerable
|
Chris@210
|
199
|
Chris@210
|
# The current line number of the scanner, starting at 1.
#
# Counts the newlines in the input that lie *before* the scan pointer, so
# a pointer resting on a newline still belongs to the line that newline
# terminates.
#
# Beware, this is implemented inefficiently (it re-reads the input from
# the start on every call). It should be used for debugging only.
def line
  source = string
  # StringScanner#pos is a byte offset, so cut by bytes where the runtime
  # offers it; character indexing would drift on multibyte input.
  consumed =
    if source.respond_to?(:byteslice)
      source.byteslice(0, pos)
    else
      source[0, pos]
    end
  consumed.count("\n") + 1
end
|
Chris@210
|
207
|
Chris@210
|
# The column of byte offset +pos+ (defaults to the scan pointer) within
# its line, starting at 1 and counted in bytes.
#
# The first byte of every line is column 1; the start of the input is
# column 1 as well. A pointer resting on a newline is reported as the
# column just past the last character of that line.
def column pos = self.pos
  return 1 if pos <= 0
  source = string()
  # +pos+ is a byte offset. When the input contains multibyte characters,
  # search a binary-tagged copy so that rindex counts bytes too. The copy
  # is cached in @bin_string.
  if source.respond_to?(:bytesize) && (defined?(@bin_string) || source.bytesize != source.size)
    @bin_string ||= source.dup.force_encoding('binary')
    source = @bin_string
  end
  # Look for the previous newline strictly before +pos+, so a newline
  # sitting at +pos+ is not mistaken for the start of its own line.
  # With no newline found, -1 makes the first line 1-based as well.
  pos - (source.rindex(?\n, pos - 1) || -1)
end
|
Chris@210
|
217
|
Chris@210
|
# Marshal hook: only the options Hash is serialized.
def marshal_dump()
  @options
end
|
Chris@210
|
221
|
Chris@210
|
# Marshal hook: restores the options Hash and nothing else.
def marshal_load(options)
  @options = options
end
|
Chris@210
|
225
|
Chris@210
|
226 protected
|
Chris@210
|
227
|
Chris@210
|
# Hook for subclasses: one-time initialization per scanner instance.
# Scanner#initialize calls it as its last step.
#
# Initialization that has to happen once per scan belongs in reset
# instead.
def setup
  # Intentionally empty; subclasses override this.
end
|
Chris@210
|
235
|
Chris@210
|
# The central method, and commonly the only one a subclass implements.
#
# Subclasses must override it; the override must return +tokens+ and
# must only use Tokens#<< for storing scanned tokens!
#
# This base version always raises NotImplementedError.
def scan_tokens tokens, options
  message = "#{self.class}#scan_tokens not implemented."
  raise NotImplementedError, message
end
|
Chris@210
|
245
|
Chris@210
|
# Clears the per-scan state: empties the token container (unless the
# :keep_tokens option is set), forgets the cached scan result, and drops
# the cached binary copy of the input if one was ever created.
def reset_instance
  @tokens.clear if !@options[:keep_tokens]
  @cached_tokens = nil
  return unless defined?(@bin_string)
  @bin_string = nil
end
|
Chris@210
|
251
|
Chris@210
|
# Raises a ScanError carrying additional status information.
#
# * +msg+ is the error description.
# * +tokens+ is the token container; its size and last ten entries are
#   included in the report.
# * +state+ is the scanner state to report.
# * +ambit+ is how much input to show on each side of the scan pointer.
#
# The report also names the calling file, the current line, column and
# position, the last match, and the bol?/eos? flags.
def raise_inspect msg, tokens, state = 'No state given!', ambit = 30
  source = string
  # +pos+ is a byte offset, so cut the context by bytes where possible.
  slicer = source.respond_to?(:byteslice) ? :byteslice : :[]
  # Clamp the start of the leading context: a negative start would be
  # counted from the END of the input and show unrelated code (or nil).
  context_start = [pos - ambit, 0].max
  code_before = source.send(slicer, context_start, pos - context_start)
  code_after = source.send(slicer, pos, ambit)

  raise ScanError, <<-EOE % [


***ERROR in %s: %s (after %d tokens)

tokens:
%s

current line: %d column: %d pos: %d
matched: %p state: %p
bol? = %p, eos? = %p

surrounding code:
%p ~~ %p


***ERROR***

  EOE
    File.basename(caller[0]),
    msg,
    tokens.size,
    tokens.last(10).map { |t| t.inspect }.join("\n"),
    line, column, pos,
    matched, state, bol?, eos?,
    code_before,
    code_after,
  ]
end
|
Chris@210
|
283
|
Chris@210
|
284 end
|
Chris@210
|
285
|
Chris@210
|
286 end
|
Chris@210
|
287 end
|
Chris@210
|
288
|
Chris@210
|
class String
  # Returns the string with DOS ("\r\n") and old Mac ("\r") line endings
  # turned into Unix newlines. A string that contains no carriage return
  # is returned as is (the very same object, not a copy).
  def to_unix
    return self unless index(?\r)
    gsub(/\r\n?/, "\n")
  end
end
|