Chris@210
|
1 module CodeRay
|
Chris@210
|
2
|
Chris@210
|
3 # = Tokens
|
Chris@210
|
4 #
|
Chris@210
|
5 # The Tokens class represents a list of tokens returned from
|
Chris@210
|
6 # a Scanner.
|
Chris@210
|
7 #
|
Chris@210
|
8 # A token is not a special object, just a two-element Array
|
Chris@210
|
9 # consisting of
|
Chris@210
|
10 # * the _token_ _text_ (the original source of the token in a String) or
|
Chris@210
|
11 # a _token_ _action_ (:open, :close, :begin_line, :end_line)
|
Chris@210
|
12 # * the _token_ _kind_ (a Symbol representing the type of the token)
|
Chris@210
|
13 #
|
Chris@210
|
14 # A token looks like this:
|
Chris@210
|
15 #
|
Chris@210
|
16 # ['# It looks like this', :comment]
|
Chris@210
|
17 # ['3.1415926', :float]
|
Chris@210
|
18 # ['$^', :error]
|
Chris@210
|
19 #
|
Chris@210
|
20 # Some scanners also yield sub-tokens, represented by special
|
Chris@210
|
21 # token actions, namely :open and :close.
|
Chris@210
|
22 #
|
Chris@210
|
23 # The Ruby scanner, for example, splits "a string" into:
|
Chris@210
|
24 #
|
Chris@210
|
25 # [
|
Chris@210
|
26 # [:open, :string],
|
Chris@210
|
27 # ['"', :delimiter],
|
Chris@210
|
28 # ['a string', :content],
|
Chris@210
|
29 # ['"', :delimiter],
|
Chris@210
|
30 # [:close, :string]
|
Chris@210
|
31 # ]
|
Chris@210
|
32 #
|
Chris@210
|
33 # Tokens is the interface between Scanners and Encoders:
|
Chris@210
|
34 # The input is split and saved into a Tokens object. The Encoder
|
Chris@210
|
35 # then builds the output from this object.
|
Chris@210
|
36 #
|
Chris@210
|
37 # Thus, the syntax below becomes clear:
|
Chris@210
|
38 #
|
Chris@210
|
39 # CodeRay.scan('price = 2.59', :ruby).html
|
Chris@210
|
40 # # the Tokens object is here -------^
|
Chris@210
|
41 #
|
Chris@210
|
42 # See how small it is? ;)
|
Chris@210
|
43 #
|
Chris@210
|
44 # Tokens gives you the power to handle pre-scanned code very easily:
|
Chris@210
|
45 # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
|
Chris@210
|
46 # that you put in your DB.
|
Chris@210
|
47 #
|
Chris@210
|
48 # It also allows you to generate tokens directly (without using a scanner),
|
Chris@210
|
49 # to load them from a file, and still use any Encoder that CodeRay provides.
|
Chris@210
|
50 #
|
Chris@210
|
51 # Tokens' subclass TokenStream allows streaming to save memory.
|
Chris@210
|
52 class Tokens < Array
|
Chris@210
|
53
|
Chris@210
|
54 # The Scanner instance that created the tokens.
|
Chris@210
|
55 attr_accessor :scanner
|
Chris@210
|
56
|
Chris@210
|
    # Whether the object is a TokenStream.
    #
    # Returns false; the streaming counterpart TokenStream#stream?
    # returns true.
    def stream?
      false
    end
|
Chris@210
|
63
|
Chris@210
|
64 # Iterates over all tokens.
|
Chris@210
|
65 #
|
Chris@210
|
66 # If a filter is given, only tokens of that kind are yielded.
|
Chris@210
|
67 def each kind_filter = nil, &block
|
Chris@210
|
68 unless kind_filter
|
Chris@210
|
69 super(&block)
|
Chris@210
|
70 else
|
Chris@210
|
71 super() do |text, kind|
|
Chris@210
|
72 next unless kind == kind_filter
|
Chris@210
|
73 yield text, kind
|
Chris@210
|
74 end
|
Chris@210
|
75 end
|
Chris@210
|
76 end
|
Chris@210
|
77
|
Chris@210
|
78 # Iterates over all text tokens.
|
Chris@210
|
79 # Range tokens like [:open, :string] are left out.
|
Chris@210
|
80 #
|
Chris@210
|
81 # Example:
|
Chris@210
|
82 # tokens.each_text_token { |text, kind| text.replace html_escape(text) }
|
Chris@210
|
83 def each_text_token
|
Chris@210
|
84 each do |text, kind|
|
Chris@210
|
85 next unless text.is_a? ::String
|
Chris@210
|
86 yield text, kind
|
Chris@210
|
87 end
|
Chris@210
|
88 end
|
Chris@210
|
89
|
Chris@210
|
90 # Encode the tokens using encoder.
|
Chris@210
|
91 #
|
Chris@210
|
92 # encoder can be
|
Chris@210
|
93 # * a symbol like :html oder :statistic
|
Chris@210
|
94 # * an Encoder class
|
Chris@210
|
95 # * an Encoder object
|
Chris@210
|
96 #
|
Chris@210
|
97 # options are passed to the encoder.
|
Chris@210
|
98 def encode encoder, options = {}
|
Chris@210
|
99 unless encoder.is_a? Encoders::Encoder
|
Chris@210
|
100 unless encoder.is_a? Class
|
Chris@210
|
101 encoder_class = Encoders[encoder]
|
Chris@210
|
102 end
|
Chris@210
|
103 encoder = encoder_class.new options
|
Chris@210
|
104 end
|
Chris@210
|
105 encoder.encode_tokens self, options
|
Chris@210
|
106 end
|
Chris@210
|
107
|
Chris@210
|
108
|
Chris@210
|
    # Turn into a string using Encoders::Text.
    #
    # +options+ are passed to the encoder if given.
    # (Delegates to #encode with the :text encoder.)
    def to_s options = {}
      encode :text, options
    end
|
Chris@210
|
115
|
Chris@210
|
    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
    #
    # NOTE(review): there is no matching respond_to_missing? override,
    # so tokens.respond_to?(:html) returns false even though the call
    # works. Also, a block given to the missing method is not forwarded.
    def method_missing meth, options = {}
      Encoders[meth].new(options).encode_tokens self
    end
|
Chris@210
|
123
|
Chris@210
|
124 # Returns the tokens compressed by joining consecutive
|
Chris@210
|
125 # tokens of the same kind.
|
Chris@210
|
126 #
|
Chris@210
|
127 # This can not be undone, but should yield the same output
|
Chris@210
|
128 # in most Encoders. It basically makes the output smaller.
|
Chris@210
|
129 #
|
Chris@210
|
130 # Combined with dump, it saves space for the cost of time.
|
Chris@210
|
131 #
|
Chris@210
|
132 # If the scanner is written carefully, this is not required -
|
Chris@210
|
133 # for example, consecutive //-comment lines could already be
|
Chris@210
|
134 # joined in one comment token by the Scanner.
|
Chris@210
|
135 def optimize
|
Chris@210
|
136 last_kind = last_text = nil
|
Chris@210
|
137 new = self.class.new
|
Chris@210
|
138 for text, kind in self
|
Chris@210
|
139 if text.is_a? String
|
Chris@210
|
140 if kind == last_kind
|
Chris@210
|
141 last_text << text
|
Chris@210
|
142 else
|
Chris@210
|
143 new << [last_text, last_kind] if last_kind
|
Chris@210
|
144 last_text = text
|
Chris@210
|
145 last_kind = kind
|
Chris@210
|
146 end
|
Chris@210
|
147 else
|
Chris@210
|
148 new << [last_text, last_kind] if last_kind
|
Chris@210
|
149 last_kind = last_text = nil
|
Chris@210
|
150 new << [text, kind]
|
Chris@210
|
151 end
|
Chris@210
|
152 end
|
Chris@210
|
153 new << [last_text, last_kind] if last_kind
|
Chris@210
|
154 new
|
Chris@210
|
155 end
|
Chris@210
|
156
|
Chris@210
|
    # Compact the object itself; see optimize.
    #
    # Returns self (Array#replace returns the receiver).
    def optimize!
      replace optimize
    end
|
Chris@210
|
161
|
Chris@210
|
    # Ensure that all :open tokens have a correspondent :close one.
    #
    # Returns a new token list where:
    # * a :close/:end_line that was never opened is dropped,
    # * a :close/:end_line for a token opened earlier first emits the
    #   closes for everything opened in between,
    # * tokens still open at the end are closed.
    #
    # NOTE(review): when an unexpected close is *skipped*, the entry
    # popped into +expected+ is not pushed back onto the stack —
    # confirm whether losing that entry is intended.
    #
    # TODO: Test this!
    def fix
      tokens = self.class.new
      # Check token nesting using a stack of kinds.
      opened = []
      for type, kind in self
        case type
        when :open
          # Remember the close action we will expect for this kind.
          opened.push [:close, kind]
        when :begin_line
          opened.push [:end_line, kind]
        when :close, :end_line
          expected = opened.pop
          if [type, kind] != expected
            # Unexpected :close; decide what to do based on the kind:
            # - token was never opened: delete the :close (just skip it)
            next unless opened.rindex expected
            # - token was opened earlier: also close tokens in between
            tokens << token until (token = opened.pop) == expected
          end
        end
        tokens << [type, kind]
      end
      # Close remaining opened tokens
      tokens << token while token = opened.pop
      tokens
    end
|
Chris@210
|
191
|
Chris@210
|
    # Fix the token nesting of the object itself; see #fix.
    #
    # Returns self (Array#replace returns the receiver).
    def fix!
      replace fix
    end
|
Chris@210
|
195
|
Chris@210
|
    # TODO: Scanner#split_into_lines
    #
    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other token are single-line)
    # - there are no open tokens at the end the line
    #
    # This makes it simple for encoders that work line-oriented,
    # like HTML with list-style numeration.
    #
    # Not yet implemented; always raises NotImplementedError.
    def split_into_lines
      raise NotImplementedError
    end
|
Chris@210
|
208
|
Chris@210
|
    # In-place version of #split_into_lines.
    # Currently always raises NotImplementedError, because
    # #split_into_lines is not implemented yet.
    def split_into_lines!
      replace split_into_lines
    end
|
Chris@210
|
212
|
Chris@210
|
213 # Dumps the object into a String that can be saved
|
Chris@210
|
214 # in files or databases.
|
Chris@210
|
215 #
|
Chris@210
|
216 # The dump is created with Marshal.dump;
|
Chris@210
|
217 # In addition, it is gzipped using GZip.gzip.
|
Chris@210
|
218 #
|
Chris@210
|
219 # The returned String object includes Undumping
|
Chris@210
|
220 # so it has an #undump method. See Tokens.load.
|
Chris@210
|
221 #
|
Chris@210
|
222 # You can configure the level of compression,
|
Chris@210
|
223 # but the default value 7 should be what you want
|
Chris@210
|
224 # in most cases as it is a good compromise between
|
Chris@210
|
225 # speed and compression rate.
|
Chris@210
|
226 #
|
Chris@210
|
227 # See GZip module.
|
Chris@210
|
228 def dump gzip_level = 7
|
Chris@210
|
229 require 'coderay/helpers/gzip_simple'
|
Chris@210
|
230 dump = Marshal.dump self
|
Chris@210
|
231 dump = dump.gzip gzip_level
|
Chris@210
|
232 dump.extend Undumping
|
Chris@210
|
233 end
|
Chris@210
|
234
|
Chris@210
|
235 # The total size of the tokens.
|
Chris@210
|
236 # Should be equal to the input size before
|
Chris@210
|
237 # scanning.
|
Chris@210
|
238 def text_size
|
Chris@210
|
239 size = 0
|
Chris@210
|
240 each_text_token do |t, k|
|
Chris@210
|
241 size + t.size
|
Chris@210
|
242 end
|
Chris@210
|
243 size
|
Chris@210
|
244 end
|
Chris@210
|
245
|
Chris@210
|
246 # Return all text tokens joined into a single string.
|
Chris@210
|
247 def text
|
Chris@210
|
248 map { |t, k| t if t.is_a? ::String }.join
|
Chris@210
|
249 end
|
Chris@210
|
250
|
Chris@210
|
    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      #
      # Returns the deserialized value - commonly a Tokens object,
      # but this is not guaranteed (see Tokens.load).
      def undump
        Tokens.load self
      end
    end
|
Chris@210
|
261
|
Chris@210
|
262 # Undump the object using Marshal.load, then
|
Chris@210
|
263 # unzip it using GZip.gunzip.
|
Chris@210
|
264 #
|
Chris@210
|
265 # The result is commonly a Tokens object, but
|
Chris@210
|
266 # this is not guaranteed.
|
Chris@210
|
267 def Tokens.load dump
|
Chris@210
|
268 require 'coderay/helpers/gzip_simple'
|
Chris@210
|
269 dump = dump.gunzip
|
Chris@210
|
270 @dump = Marshal.load dump
|
Chris@210
|
271 end
|
Chris@210
|
272
|
Chris@210
|
273 end
|
Chris@210
|
274
|
Chris@210
|
275
|
Chris@210
|
  # = TokenStream
  #
  # The TokenStream class is a fake Array without elements.
  #
  # It redirects the method << to a block given at creation.
  #
  # This allows scanners and Encoders to use streaming (no
  # tokens are saved, the input is highlighted the same time it
  # is scanned) with the same code.
  #
  # See CodeRay.encode_stream and CodeRay.scan_stream
  class TokenStream < Tokens

    # Whether the object is a TokenStream.
    #
    # Returns true.
    def stream?
      true
    end

    # The Array is empty, but size counts the tokens given by <<.
    attr_reader :size

    # Creates a new TokenStream that calls +block+ whenever
    # its << method is called.
    #
    # Raises ArgumentError if no block is given.
    #
    # Example:
    #
    #   require 'coderay'
    #
    #   token_stream = CodeRay::TokenStream.new do |text, kind|
    #     puts 'kind: %s, text size: %d.' % [kind, text.size]
    #   end
    #
    #   token_stream << ['/\d+/', :regexp]
    #   #-> kind: regexp, text size: 5.
    #
    def initialize &block
      raise ArgumentError, 'Block expected for streaming.' unless block
      @callback = block
      @size = 0
    end

    # Calls +block+ with +token+ and increments size.
    # The token is splatted, so the block receives (text, kind).
    #
    # Returns self.
    def << token
      @callback.call(*token)
      @size += 1
      self
    end

    # This method is not implemented due to speed reasons. Use Tokens.
    def text_size
      raise NotImplementedError,
        'This method is not implemented due to speed reasons.'
    end

    # A TokenStream cannot be dumped. Use Tokens.
    def dump
      raise NotImplementedError, 'A TokenStream cannot be dumped.'
    end

    # A TokenStream cannot be optimized. Use Tokens.
    def optimize
      raise NotImplementedError, 'A TokenStream cannot be optimized.'
    end

  end
|
Chris@210
|
345
|
Chris@210
|
346 end
|
Chris@210
|
347
|
Chris@210
|
# Run the embedded unit tests (in the DATA section below __END__) when
# this file is executed directly: enable warnings, put the lib directory
# on the load path, and eval the tests with correct file/line info.
if $0 == __FILE__
  $VERBOSE = true
  $: << File.join(File.dirname(__FILE__), '..')
  eval DATA.read, nil, $0, __LINE__ + 4
end
|
Chris@210
|
353
|
Chris@210
|
354 __END__
|
Chris@210
|
355 require 'test/unit'
|
Chris@210
|
356
|
Chris@210
|
# Unit tests for CodeRay::Tokens (evaluated via the DATA hook above).
class TokensTest < Test::Unit::TestCase

  # Tokens is an Array subclass and can be created without arguments.
  def test_creation
    assert CodeRay::Tokens < Array
    tokens = nil
    assert_nothing_raised do
      tokens = CodeRay::Tokens.new
    end
    assert_kind_of Array, tokens
  end

  # Tokens accepts [text, kind] pairs via <<.
  def test_adding_tokens
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    # Fix: assert_equal takes (expected, actual); the arguments were
    # swapped, which produces misleading failure messages.
    assert_equal 2, tokens.size
  end

  # dump/undump must round-trip a token list unchanged.
  def test_dump_undump
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    tokens2 = nil
    assert_nothing_raised do
      tokens2 = tokens.dump.undump
    end
    assert_equal tokens, tokens2
  end

end