vendor/gems/coderay-0.9.7/lib/coderay/tokens.rb @ 210:0579821a129a

Update to Redmine trunk rev 4802
author Chris Cannam
date Tue, 08 Feb 2011 13:51:46 +0000
module CodeRay

  # = Tokens
  #
  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a two-element Array
  # consisting of
  # * the _token_ _text_ (the original source of the token in a String) or
  #   a _token_ _action_ (:open, :close, :begin_line, :end_line)
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  #
  # A token looks like this:
  #
  #  ['# It looks like this', :comment]
  #  ['3.1415926', :float]
  #  ['$^', :error]
  #
  # Some scanners also yield sub-tokens, represented by special
  # token actions, namely :open and :close.
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #  [
  #   [:open, :string],
  #   ['"', :delimiter],
  #   ['a string', :content],
  #   ['"', :delimiter],
  #   [:close, :string]
  #  ]
  #
  # Tokens is the interface between Scanners and Encoders:
  # the input is split and saved into a Tokens object, and the Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #  CodeRay.scan('price = 2.59', :ruby).html
  #  # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # you can convert it to a webpage, a YAML file, or dump it into a
  # gzipped string that you put in your database.
  #
  # It also allows you to generate tokens directly (without using a scanner),
  # to load them from a file, and still use any Encoder that CodeRay provides.
  #
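  # Example (a minimal sketch; assumes the bundled :html and :yaml
  # encoders are available):
  #
  #  tokens = CodeRay.scan 'price = 2.59', :ruby
  #  tokens.html  # an HTML fragment
  #  tokens.yaml  # a YAML document
  #  tokens.dump  # a gzipped Marshal dump for your database
  #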
  # Tokens' subclass TokenStream allows streaming to save memory.
  class Tokens < Array

    # The Scanner instance that created the tokens.
    attr_accessor :scanner

    # Whether the object is a TokenStream.
    #
    # Returns false.
    def stream?
      false
    end

    # Iterates over all tokens.
    #
    # If a filter is given, only tokens of that kind are yielded.
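    #
    # Example (a sketch; yields only the comment tokens):
    #
    #  tokens.each(:comment) { |text, kind| puts text }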
    def each kind_filter = nil, &block
      unless kind_filter
        super(&block)
      else
        super() do |text, kind|
          next unless kind == kind_filter
          yield text, kind
        end
      end
    end

    # Iterates over all text tokens.
    # Action tokens like [:open, :string] are left out.
    #
    # Example:
    #  tokens.each_text_token { |text, kind| text.replace html_escape(text) }
    def each_text_token
      each do |text, kind|
        next unless text.is_a? ::String
        yield text, kind
      end
    end

    # Encode the tokens using +encoder+.
    #
    # +encoder+ can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # +options+ are passed to the encoder.
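    #
    # Example (:line_numbers is an option of the bundled HTML encoder):
    #
    #  tokens.encode :html, :line_numbers => :table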
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        # Resolve a symbol like :html to its Encoder class first.
        encoder = Encoders[encoder] unless encoder.is_a? Class
        encoder = encoder.new options
      end
      encoder.encode_tokens self, options
    end

    # Turn into a string using Encoders::Text.
    #
    # +options+ are passed to the encoder if given.
    def to_s options = {}
      encode :text, options
    end

    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
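    #
    # Example:
    #
    #  tokens.html  # same as tokens.encode :html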
    def method_missing meth, options = {}
      Encoders[meth].new(options).encode_tokens self
    end

    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This cannot be undone, but should yield the same output
    # in most Encoders. It basically makes the output smaller.
    #
    # Combined with dump, it saves space at the cost of time.
    #
    # If the scanner is written carefully, this is not required;
    # for example, consecutive //-comment lines could already be
    # joined into one comment token by the Scanner.
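    #
    # Example (a sketch):
    #
    #  tokens = CodeRay::Tokens.new
    #  tokens << ['# foo', :comment] << [' bar', :comment]
    #  tokens.optimize  # => [['# foo bar', :comment]]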
    def optimize
      last_kind = last_text = nil
      new = self.class.new
      for text, kind in self
        if text.is_a? String
          if kind == last_kind
            last_text << text
          else
            new << [last_text, last_kind] if last_kind
            last_text = text
            last_kind = kind
          end
        else
          new << [last_text, last_kind] if last_kind
          last_kind = last_text = nil
          new << [text, kind]
        end
      end
      new << [last_text, last_kind] if last_kind
      new
    end

    # Compact the object itself; see optimize.
    def optimize!
      replace optimize
    end

    # Ensure that all :open tokens have a corresponding :close one.
    #
    # TODO: Test this!
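    #
    # Example (a sketch; the missing [:close, :string] is appended):
    #
    #  tokens = CodeRay::Tokens.new
    #  tokens << [:open, :string] << ['"unterminated', :content]
    #  tokens.fix.last  # => [:close, :string]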
    def fix
      tokens = self.class.new
      # Check token nesting using a stack of kinds.
      opened = []
      for type, kind in self
        case type
        when :open
          opened.push [:close, kind]
        when :begin_line
          opened.push [:end_line, kind]
        when :close, :end_line
          expected = opened.pop
          if [type, kind] != expected
            # Unexpected :close; decide what to do based on the kind:
            # - token was never opened: delete the :close (just skip it)
            next unless opened.rindex expected
            # - token was opened earlier: also close tokens in between
            tokens << token until (token = opened.pop) == expected
          end
        end
        tokens << [type, kind]
      end
      # Close remaining opened tokens.
      tokens << token while token = opened.pop
      tokens
    end

    def fix!
      replace fix
    end

    # TODO: Scanner#split_into_lines
    #
    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other tokens are single-line)
    # - there are no open tokens at the end of a line
    #
    # This makes things simple for encoders that work line-oriented,
    # like HTML with list-style line numbering.
    def split_into_lines
      raise NotImplementedError
    end

    def split_into_lines!
      replace split_into_lines
    end

    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # in addition, it is gzipped using GZip.gzip.
    #
    # The returned String object includes Undumping,
    # so it has an #undump method. See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases, as it is a good compromise between
    # speed and compression rate.
    #
    # See the GZip module.
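    #
    # Example:
    #
    #  blob = tokens.dump     # a String with #undump, gzipped at level 7
    #  blob.undump == tokens  # => true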
    def dump gzip_level = 7
      require 'coderay/helpers/gzip_simple'
      dump = Marshal.dump self
      dump = dump.gzip gzip_level
      dump.extend Undumping
    end

    # The total size of the tokens.
    # Should be equal to the input size before
    # scanning.
    def text_size
      size = 0
      each_text_token do |t, k|
        size += t.size
      end
      size
    end

    # Returns all text tokens joined into a single string.
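    #
    # Example:
    #
    #  CodeRay.scan('1 + 1', :ruby).text  # => "1 + 1"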
    def text
      map { |t, k| t if t.is_a? ::String }.join
    end

    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end

    # Unzip the dump using GZip.gunzip, then undump it
    # using Marshal.load.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
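    #
    # Example:
    #
    #  CodeRay::Tokens.load(tokens.dump) == tokens  # => true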
    def Tokens.load dump
      require 'coderay/helpers/gzip_simple'
      dump = dump.gunzip
      Marshal.load dump
    end

  end

  # = TokenStream
  #
  # The TokenStream class is a fake Array without elements.
  #
  # It redirects the method << to a block given at creation.
  #
  # This allows scanners and Encoders to use streaming (no
  # tokens are saved; the input is highlighted at the same time
  # as it is scanned) with the same code.
  #
  # See CodeRay.encode_stream and CodeRay.scan_stream.
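  #
  # Example (a sketch using only the interface below):
  #
  #  stream = CodeRay::TokenStream.new do |text, kind|
  #    print text if text.is_a? ::String
  #  end
  #  stream << ['puts', :ident] << [' ', :space] << ['1', :integer]
  #  #-> prints "puts 1"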
  class TokenStream < Tokens

    # Whether the object is a TokenStream.
    #
    # Returns true.
    def stream?
      true
    end

    # The Array is empty, but size counts the tokens given by <<.
    attr_reader :size

    # Creates a new TokenStream that calls +block+ whenever
    # its << method is called.
    #
    # Example:
    #
    #  require 'coderay'
    #
    #  token_stream = CodeRay::TokenStream.new do |text, kind|
    #    puts 'kind: %s, text size: %d.' % [kind, text.size]
    #  end
    #
    #  token_stream << ['/\d+/', :regexp]
    #  #-> kind: regexp, text size: 5.
    #
    def initialize &block
      raise ArgumentError, 'Block expected for streaming.' unless block
      @callback = block
      @size = 0
    end

    # Calls +block+ with +token+ and increments size.
    #
    # Returns self.
    def << token
      @callback.call(*token)
      @size += 1
      self
    end

    # This method is not implemented due to speed reasons. Use Tokens.
    def text_size
      raise NotImplementedError,
        'This method is not implemented due to speed reasons.'
    end

    # A TokenStream cannot be dumped. Use Tokens.
    def dump
      raise NotImplementedError, 'A TokenStream cannot be dumped.'
    end

    # A TokenStream cannot be optimized. Use Tokens.
    def optimize
      raise NotImplementedError, 'A TokenStream cannot be optimized.'
    end

  end

end

if $0 == __FILE__
  $VERBOSE = true
  $: << File.join(File.dirname(__FILE__), '..')
  eval DATA.read, nil, $0, __LINE__ + 4
end

__END__
require 'test/unit'

class TokensTest < Test::Unit::TestCase

  def test_creation
    assert CodeRay::Tokens < Array
    tokens = nil
    assert_nothing_raised do
      tokens = CodeRay::Tokens.new
    end
    assert_kind_of Array, tokens
  end

  def test_adding_tokens
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    assert_equal tokens.size, 2
  end

  def test_dump_undump
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    tokens2 = nil
    assert_nothing_raised do
      tokens2 = tokens.dump.undump
    end
    assert_equal tokens, tokens2
  end

end