vendor/plugins/coderay-0.9.2/lib/coderay/.svn/text-base/tokens.rb.svn-base @ 0:513646585e45

* Import Redmine trunk SVN rev 3859
author Chris Cannam
date Fri, 23 Jul 2010 15:52:44 +0100
module CodeRay

  # = Tokens
  #
  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a two-element Array
  # consisting of
  # * the _token_ _text_ (the original source of the token in a String)
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  #
  # A token looks like this:
  #
  #   ['# It looks like this', :comment]
  #   ['3.1415926', :float]
  #   ['$^', :error]
  #
  # Some scanners also yield some kind of sub-tokens, represented by special
  # token texts, namely :open and :close.
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #   [
  #     [:open, :string],
  #     ['"', :delimiter],
  #     ['a string', :content],
  #     ['"', :delimiter],
  #     [:close, :string]
  #   ]
  #
  # Tokens is also the interface between Scanners and Encoders:
  # the input is split and saved into a Tokens object, and the Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #   CodeRay.scan('price = 2.59', :ruby).html
  #   # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # you can convert it to a webpage or a YAML file, or dump it into a
  # gzipped string that you put in your database.
  #
  # Tokens' subclass TokenStream allows streaming to save memory.
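  #
  # A short sketch of typical usage (using the standard :html encoder):
  #
  #   tokens = CodeRay.scan 'price = 2.59', :ruby
  #   tokens.encode :html  # highlight as HTML
  #   tokens.dump          # gzipped string, e.g. for your database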
  class Tokens < Array

    # The Scanner instance that created the tokens.
    attr_accessor :scanner

    # Whether the object is a TokenStream.
    #
    # Returns false.
    def stream?
      false
    end

    # Iterates over all tokens.
    #
    # If a filter is given, only tokens of that kind are yielded.
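    #
    # Example (a sketch; prints the text of each comment token):
    #
    #   tokens.each(:comment) { |text, kind| puts text }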
    def each kind_filter = nil, &block
      unless kind_filter
        super(&block)
      else
        super() do |text, kind|
          next unless kind == kind_filter
          yield text, kind
        end
      end
    end

    # Iterates over all text tokens.
    # Range tokens like [:open, :string] are left out.
    #
    # Example:
    #   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
    def each_text_token
      each do |text, kind|
        next unless text.is_a? ::String
        yield text, kind
      end
    end

    # Encode the tokens using +encoder+.
    #
    # +encoder+ can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # +options+ are passed to the encoder.
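    #
    # For example, these calls should be equivalent (HTML being the
    # standard HTML encoder class):
    #
    #   tokens.encode :html
    #   tokens.encode CodeRay::Encoders::HTML
    #   tokens.encode CodeRay::Encoders::HTML.new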
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        # Accept an Encoder class directly; look up symbols in Encoders.
        encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
        encoder = encoder_class.new options
      end
      encoder.encode_tokens self, options
    end


    # Turn into a string using Encoders::Text.
    #
    # +options+ are passed to the encoder if given.
    def to_s options = {}
      encode :text, options
    end

    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
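    #
    # A sketch:
    #
    #   tokens.html  # same as tokens.encode(:html)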
    def method_missing meth, options = {}
      Encoders[meth].new(options).encode_tokens self
    end

    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This cannot be undone, but should yield the same output
    # in most Encoders. It basically makes the output smaller.
    #
    # Combined with dump, it saves space at the cost of time.
    #
    # If the scanner is written carefully, this is not required -
    # for example, consecutive //-comment lines could already be
    # joined in one comment token by the Scanner.
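    #
    # Example (a sketch of the expected behaviour):
    #
    #   Tokens[['foo', :content], ['bar', :content]].optimize
    #   #-> [['foobar', :content]]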
    def optimize
      last_kind = last_text = nil
      new = self.class.new
      for text, kind in self
        if text.is_a? String
          if kind == last_kind
            last_text << text
          else
            new << [last_text, last_kind] if last_kind
            last_text = text.dup  # dup so joining does not mutate self
            last_kind = kind
          end
        else
          new << [last_text, last_kind] if last_kind
          last_kind = last_text = nil
          new << [text, kind]
        end
      end
      new << [last_text, last_kind] if last_kind
      new
    end

    # Compact the object itself; see optimize.
    def optimize!
      replace optimize
    end

    # Ensure that all :open tokens have a corresponding :close token.
    #
    # TODO: Test this!
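    #
    # Example (expected behaviour; note the TODO above):
    #
    #   Tokens[[:open, :string], ['"', :delimiter]].fix
    #   #-> [[:open, :string], ['"', :delimiter], [:close, :string]]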
    def fix
      tokens = self.class.new
      # Check token nesting using a stack of expected close tokens.
      opened = []
      for type, kind in self
        case type
        when :open
          opened.push [:close, kind]
        when :begin_line
          opened.push [:end_line, kind]
        when :close, :end_line
          if opened.last == [type, kind]
            opened.pop
          else
            # Unexpected close token; decide what to do based on the stack:
            # - the token was never opened: delete the close (just skip it)
            next unless opened.rindex [type, kind]
            # - the token was opened earlier: also close tokens in between
            tokens << token until (token = opened.pop) == [type, kind]
          end
        end
        tokens << [type, kind]
      end
      # Close remaining opened tokens.
      tokens << token while token = opened.pop
      tokens
    end

    def fix!
      replace fix
    end

    # TODO: Scanner#split_into_lines
    #
    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other tokens are single-line)
    # - there are no open tokens at the end of a line
    #
    # This makes it simple for encoders that work line-oriented,
    # like HTML with list-style line numbering.
    def split_into_lines
      raise NotImplementedError
    end

    def split_into_lines!
      replace split_into_lines
    end

    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # in addition, it is gzipped using GZip.gzip.
    #
    # The returned String object includes Undumping,
    # so it has an #undump method. See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases, as it is a good compromise between
    # speed and compression rate.
    #
    # See the GZip module.
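    #
    # Example (a sketch; GZip comes from coderay/helpers/gzip_simple):
    #
    #   dump = tokens.dump 9   # maximum compression
    #   dump.undump == tokens  #-> true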
    def dump gzip_level = 7
      require 'coderay/helpers/gzip_simple'
      dump = Marshal.dump self
      dump = dump.gzip gzip_level
      dump.extend Undumping
    end

    # The total size of the tokens.
    # Should be equal to the input size before
    # scanning.
    def text_size
      size = 0
      each_text_token do |t, k|
        size += t.size
      end
      size
    end

    # The concatenated text of all text tokens.
    # Should be equal to the input before scanning.
    def text
      map { |t, k| t if t.is_a? ::String }.join
    end

    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end

    # Unzip the given dump using GZip.gunzip, then
    # load it using Marshal.load.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
    def Tokens.load dump
      require 'coderay/helpers/gzip_simple'
      dump = dump.gunzip
      Marshal.load dump
    end

  end


  # = TokenStream
  #
  # The TokenStream class is a fake Array without elements.
  #
  # It redirects the method << to a block given at creation.
  #
  # This allows Scanners and Encoders to use streaming (no
  # tokens are saved; the input is highlighted at the same time it
  # is scanned) with the same code.
  #
  # See CodeRay.encode_stream and CodeRay.scan_stream.
  class TokenStream < Tokens

    # Whether the object is a TokenStream.
    #
    # Returns true.
    def stream?
      true
    end

    # The Array is empty, but size counts the tokens given by <<.
    attr_reader :size

    # Creates a new TokenStream that calls +block+ whenever
    # its << method is called.
    #
    # Example:
    #
    #   require 'coderay'
    #
    #   token_stream = CodeRay::TokenStream.new do |text, kind|
    #     puts 'kind: %s, text size: %d.' % [kind, text.size]
    #   end
    #
    #   token_stream << ['/\d+/', :regexp]
    #   #-> kind: regexp, text size: 5.
    #
    def initialize &block
      raise ArgumentError, 'Block expected for streaming.' unless block
      @callback = block
      @size = 0
    end

    # Calls +block+ with +token+ and increments size.
    #
    # Returns self.
    def << token
      @callback.call(*token)
      @size += 1
      self
    end

    # This method is not implemented due to speed reasons. Use Tokens.
    def text_size
      raise NotImplementedError,
        'This method is not implemented due to speed reasons.'
    end

    # A TokenStream cannot be dumped. Use Tokens.
    def dump
      raise NotImplementedError, 'A TokenStream cannot be dumped.'
    end

    # A TokenStream cannot be optimized. Use Tokens.
    def optimize
      raise NotImplementedError, 'A TokenStream cannot be optimized.'
    end

  end

end

if $0 == __FILE__
  $VERBOSE = true
  $: << File.join(File.dirname(__FILE__), '..')
  eval DATA.read, nil, $0, __LINE__ + 4
end

__END__
require 'test/unit'

class TokensTest < Test::Unit::TestCase

  def test_creation
    assert CodeRay::Tokens < Array
    tokens = nil
    assert_nothing_raised do
      tokens = CodeRay::Tokens.new
    end
    assert_kind_of Array, tokens
  end

  def test_adding_tokens
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    assert_equal 2, tokens.size
  end

  def test_dump_undump
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    tokens2 = nil
    assert_nothing_raised do
      tokens2 = tokens.dump.undump
    end
    assert_equal tokens, tokens2
  end
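
  # Additional tests sketching optimize and fix, based on the behaviour
  # documented above (fix is still marked "TODO: Test this!").
  def test_optimize
    tokens = CodeRay::Tokens.new
    tokens << ['foo', :content]
    tokens << ['bar', :content]
    assert_equal [['foobar', :content]], tokens.optimize
  end

  def test_fix
    tokens = CodeRay::Tokens.new
    tokens << [:open, :string]
    tokens << ['"', :delimiter]
    assert_equal [:close, :string], tokens.fix.last
  end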

end