Chris@0
|
1 module CodeRay
|
Chris@0
|
2
|
Chris@0
|
3 # = Tokens
|
Chris@0
|
4 #
|
Chris@0
|
5 # The Tokens class represents a list of tokens returned from
|
Chris@0
|
6 # a Scanner.
|
Chris@0
|
7 #
|
Chris@0
|
8 # A token is not a special object, just a two-element Array
|
Chris@0
|
9 # consisting of
|
Chris@0
|
10 # * the _token_ _text_ (the original source of the token in a String)
|
Chris@0
|
11 # * the _token_ _kind_ (a Symbol representing the type of the token)
|
Chris@0
|
12 #
|
Chris@0
|
13 # A token looks like this:
|
Chris@0
|
14 #
|
Chris@0
|
15 # ['# It looks like this', :comment]
|
Chris@0
|
16 # ['3.1415926', :float]
|
Chris@0
|
17 # ['$^', :error]
|
Chris@0
|
18 #
|
Chris@0
|
19 # Some scanners also yield some kind of sub-tokens, represented by special
|
Chris@0
|
20 # token texts, namely :open and :close.
|
Chris@0
|
21 #
|
Chris@0
|
22 # The Ruby scanner, for example, splits "a string" into:
|
Chris@0
|
23 #
|
Chris@0
|
24 # [
|
Chris@0
|
25 # [:open, :string],
|
Chris@0
|
26 # ['"', :delimiter],
|
Chris@0
|
27 # ['a string', :content],
|
Chris@0
|
28 # ['"', :delimiter],
|
Chris@0
|
29 # [:close, :string]
|
Chris@0
|
30 # ]
|
Chris@0
|
31 #
|
Chris@0
|
32 # Tokens is also the interface between Scanners and Encoders:
|
Chris@0
|
33 # The input is split and saved into a Tokens object. The Encoder
|
Chris@0
|
34 # then builds the output from this object.
|
Chris@0
|
35 #
|
Chris@0
|
36 # Thus, the syntax below becomes clear:
|
Chris@0
|
37 #
|
Chris@0
|
38 # CodeRay.scan('price = 2.59', :ruby).html
|
Chris@0
|
39 # # the Tokens object is here -------^
|
Chris@0
|
40 #
|
Chris@0
|
41 # See how small it is? ;)
|
Chris@0
|
42 #
|
Chris@0
|
43 # Tokens gives you the power to handle pre-scanned code very easily:
|
Chris@0
|
44 # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
|
Chris@0
|
45 # that you put in your DB.
|
Chris@0
|
46 #
|
Chris@0
|
47 # Tokens' subclass TokenStream allows streaming to save memory.
|
Chris@0
|
# A list of tokens as produced by a Scanner.
#
# Each element is a two-element Array <tt>[text, kind]</tt>. Text tokens
# carry a String in the first slot; range markers such as
# <tt>[:open, :string]</tt> carry a Symbol there instead.
class Tokens < Array

  # The Scanner instance that created the tokens.
  attr_accessor :scanner

  # Whether the object is a TokenStream.
  #
  # Returns false.
  def stream?
    false
  end

  # Iterates over all tokens, yielding +text+ and +kind+ of each one.
  #
  # If +kind_filter+ is given, only tokens of that kind are yielded.
  # With a filter but without a block, an Enumerator is returned.
  def each kind_filter = nil, &block
    return super(&block) unless kind_filter
    return enum_for(:each, kind_filter) unless block
    super() do |text, kind|
      yield text, kind if kind == kind_filter
    end
  end

  # Iterates over all text tokens.
  # Range tokens like [:open, :string] are left out.
  #
  # Example:
  #   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
  def each_text_token
    each do |text, kind|
      yield text, kind if text.is_a? ::String
    end
  end

  # Encode the tokens using encoder.
  #
  # encoder can be
  # * a symbol like :html or :statistic (looked up via Encoders[])
  # * an Encoder class
  # * an Encoder object
  #
  # options are passed to the encoder.
  def encode encoder, options = {}
    unless encoder.is_a? Encoders::Encoder
      # A class is instantiated directly; anything else is treated as an
      # encoder name and resolved through Encoders[].
      encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
      encoder = encoder_class.new options
    end
    encoder.encode_tokens self, options
  end

  # Turn into a string using the :text encoder.
  #
  # +options+ are passed to the encoder if given.
  def to_s options = {}
    encode :text, options
  end

  # Redirects unknown methods to encoder calls.
  #
  # For example, if you call +tokens.html+, the encoder registered as
  # :html is used to encode the tokens.
  #
  # NOTE(review): there is no matching respond_to_missing?, and every
  # unknown method name is handed to Encoders[] as an encoder name.
  def method_missing meth, options = {}
    Encoders[meth].new(options).encode_tokens self
  end

  # Returns the tokens compressed by joining consecutive
  # text tokens of the same kind.
  #
  # This can not be undone, but should yield the same output
  # in most Encoders. It basically makes the output smaller.
  #
  # Combined with dump, it saves space for the cost of time.
  #
  # The receiver is left untouched: merged texts are built on copies of
  # the original strings.
  def optimize
    optimized = self.class.new
    last_text = last_kind = nil
    each do |text, kind|
      if text.is_a?(::String) && last_text && kind == last_kind
        # Same kind as the pending text token: merge into it.
        last_text << text
        next
      end
      # Flush the pending text token, if there is one.
      optimized << [last_text, last_kind] if last_text
      if text.is_a? ::String
        # Work on a copy so that later merges never modify strings that
        # belong to the receiver.
        last_text = text.dup
        last_kind = kind
      else
        # Range markers are never merged; they also end the current run.
        last_text = last_kind = nil
        optimized << [text, kind]
      end
    end
    optimized << [last_text, last_kind] if last_text
    optimized
  end

  # Compact the object itself; see optimize.
  def optimize!
    replace optimize
  end

  # Returns a copy in which all :open / :begin_line tokens have a
  # corresponding :close / :end_line token:
  #
  # * a closing token that was never opened is dropped,
  # * a closing token for a range opened further out first closes all
  #   ranges that were opened in between,
  # * ranges still open at the end are closed in reverse order.
  def fix
    tokens = self.class.new
    # Stack of the closing tokens that are currently expected.
    opened = []
    each do |type, kind|
      case type
      when :open
        opened.push [:close, kind]
      when :begin_line
        opened.push [:end_line, kind]
      when :close, :end_line
        depth = opened.rindex [type, kind]
        # Never opened: skip this closing token.
        next unless depth
        # Close everything that was opened after the matching opener.
        tokens << opened.pop while opened.size > depth + 1
        opened.pop
      end
      tokens << [type, kind]
    end
    # Close remaining opened tokens.
    tokens << opened.pop until opened.empty?
    tokens
  end

  # Repair the object itself; see fix.
  def fix!
    replace fix
  end

  # TODO: Scanner#split_into_lines
  #
  # Makes sure that:
  # - newlines are single tokens
  #   (which means all other token are single-line)
  # - there are no open tokens at the end the line
  #
  # This makes it simple for encoders that work line-oriented,
  # like HTML with list-style numeration.
  #
  # Not implemented yet; always raises NotImplementedError.
  def split_into_lines
    raise NotImplementedError
  end

  # In-place variant of split_into_lines (which currently raises
  # NotImplementedError).
  def split_into_lines!
    replace split_into_lines
  end

  # Dumps the object into a String that can be saved
  # in files or databases.
  #
  # The dump is created with Marshal.dump and then compressed by calling
  # +gzip+ on the resulting String (an extension expected to come from
  # coderay/helpers/gzip_simple).
  #
  # The returned String object is extended with Undumping,
  # so it has an #undump method. See Tokens.load.
  #
  # +gzip_level+ is handed to +gzip+; it defaults to 7.
  def dump gzip_level = 7
    require 'coderay/helpers/gzip_simple'
    Marshal.dump(self).gzip(gzip_level).extend Undumping
  end

  # The total size of the text of all text tokens.
  # Should be equal to the input size before
  # scanning.
  def text_size
    size = 0
    each_text_token { |text, _kind| size += text.size }
    size
  end

  # The text of all text tokens, joined into one String.
  # Should be equal to the input before scanning.
  def text
    map { |text, _kind| text if text.is_a? ::String }.join
  end

  # Include this module to give an object an #undump
  # method.
  #
  # The string returned by Tokens#dump is extended with Undumping.
  module Undumping
    # Calls Tokens.load with itself.
    def undump
      Tokens.load self
    end
  end

  # Undump the object: the String is first uncompressed with +gunzip+,
  # then restored using Marshal.load.
  #
  # The result is commonly a Tokens object, but
  # this is not guaranteed.
  #
  # SECURITY: Marshal.load can instantiate arbitrary objects; only pass
  # dumps that come from a trusted source.
  def self.load dump
    require 'coderay/helpers/gzip_simple'
    Marshal.load dump.gunzip
  end

end
|
Chris@0
|
272
|
Chris@0
|
273
|
Chris@0
|
274 # = TokenStream
|
Chris@0
|
275 #
|
Chris@0
|
276 # The TokenStream class is a fake Array without elements.
|
Chris@0
|
277 #
|
Chris@0
|
278 # It redirects the method << to a block given at creation.
|
Chris@0
|
279 #
|
Chris@0
|
280 # This allows scanners and Encoders to use streaming (no
|
Chris@0
|
281 # tokens are saved, the input is highlighted the same time it
|
Chris@0
|
282 # is scanned) with the same code.
|
Chris@0
|
283 #
|
Chris@0
|
284 # See CodeRay.encode_stream and CodeRay.scan_stream
|
Chris@0
|
# A Tokens variant that stores nothing: every token appended with <<
# is handed to the block given at creation instead.
class TokenStream < Tokens

  # The Array itself stays empty; size is the number of tokens that have
  # been passed to << so far.
  attr_reader :size

  # Creates a new TokenStream that calls +block+ whenever
  # its << method is called. The elements of the appended token are
  # passed to the block as separate arguments.
  #
  # Raises ArgumentError if no block is given.
  #
  # Example:
  #
  #   require 'coderay'
  #
  #   token_stream = CodeRay::TokenStream.new do |text, kind|
  #     puts 'kind: %s, text size: %d.' % [kind, text.size]
  #   end
  #
  #   token_stream << ['/\d+/', :regexp]
  #   #-> kind: regexp, text size: 5.
  #
  def initialize &block
    raise ArgumentError, 'Block expected for streaming.' if block.nil?
    @size = 0
    @callback = block
  end

  # Whether the object is a TokenStream.
  #
  # Returns true.
  def stream?
    true
  end

  # Splats +token+ into the callback block, then counts it.
  #
  # Returns self, so calls can be chained.
  def << token
    @callback[*token]
    @size = @size + 1
    self
  end

  # Not available on a stream (left out for speed reasons). Use Tokens.
  def text_size
    raise NotImplementedError, 'This method is not implemented due to speed reasons.'
  end

  # A TokenStream cannot be dumped. Use Tokens.
  def dump
    raise NotImplementedError, 'A TokenStream cannot be dumped.'
  end

  # A TokenStream cannot be optimized. Use Tokens.
  def optimize
    raise NotImplementedError, 'A TokenStream cannot be optimized.'
  end

end
|
Chris@0
|
343
|
Chris@0
|
344 end
|
Chris@0
|
345
|
Chris@0
|
# Self-test hook: runs only when this file is executed directly rather than
# required. It turns on warnings, appends the parent directory to the load
# path, and evaluates the code stored after __END__. The __LINE__ + 4 value is
# passed to eval as the starting line number for that code, so no lines may be
# added or removed between the eval call and __END__.
346 if $0 == __FILE__
|
Chris@0
|
347 $VERBOSE = true
|
Chris@0
|
348 $: << File.join(File.dirname(__FILE__), '..')
|
Chris@0
|
349 eval DATA.read, nil, $0, __LINE__ + 4
|
Chris@0
|
350 end
|
Chris@0
|
351
|
Chris@0
|
352 __END__
|
Chris@0
|
353 require 'test/unit'
|
Chris@0
|
354
|
Chris@0
|
# Basic unit tests for CodeRay::Tokens; evaluated from the DATA section
# when this file is run directly.
class TokensTest < Test::Unit::TestCase

  # Tokens is an Array subclass and can be created without arguments.
  def test_creation
    assert CodeRay::Tokens < Array
    tokens = nil
    assert_nothing_raised do
      tokens = CodeRay::Tokens.new
    end
    assert_kind_of Array, tokens
  end

  # Appending [text, kind] pairs grows the list.
  def test_adding_tokens
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    # Expected value goes first so a failure message reads correctly.
    assert_equal 2, tokens.size
  end

  # A dump/undump round trip yields an equal token list.
  def test_dump_undump
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    tokens2 = nil
    assert_nothing_raised do
      tokens2 = tokens.dump.undump
    end
    assert_equal tokens, tokens2
  end

end
|
Chris@0
|
388 end |