# vendor/gems/coderay-0.9.7/lib/coderay/tokens.rb

module CodeRay

  # = Tokens
  #
  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a two-element Array
  # consisting of
  # * the _token_ _text_ (the original source of the token in a String) or
  #   a _token_ _action_ (:open, :close, :begin_line, :end_line)
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  #
  # A token looks like this:
  #
  #   ['# It looks like this', :comment]
  #   ['3.1415926', :float]
  #   ['$^', :error]
  #
  # Some scanners also yield sub-tokens, represented by special
  # token actions, namely :open and :close.
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #  [
  #   [:open, :string],
  #   ['"', :delimiter],
  #   ['a string', :content],
  #   ['"', :delimiter],
  #   [:close, :string]
  #  ]
  #
  # Tokens is the interface between Scanners and Encoders:
  # the input is split and saved into a Tokens object, and the Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #   CodeRay.scan('price = 2.59', :ruby).html
  #   # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # you can convert it to a webpage or a YAML file, or dump it into a
  # gzipped string that you can store in your database.
  #
  # It also allows you to generate tokens directly (without using a scanner),
  # to load them from a file, and still use any Encoder that CodeRay provides.
  #
  # Tokens' subclass TokenStream allows streaming to save memory.
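  #
  # A minimal end-to-end sketch of that workflow (illustrative; the exact
  # output depends on the encoders shipped with this CodeRay version):
  #
  #   tokens = CodeRay.scan 'price = 2.59', :ruby  # a Tokens object
  #   page   = tokens.html                         # encode to HTML
  #   blob   = tokens.dump                         # gzipped Marshal string
  #   # ...store blob somewhere, then later:
  #   tokens_again = CodeRay::Tokens.load blob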
  class Tokens < Array

    # The Scanner instance that created the tokens.
    attr_accessor :scanner

    # Whether the object is a TokenStream.
    #
    # Returns false.
    def stream?
      false
    end

    # Iterates over all tokens.
    #
    # If a filter is given, only tokens of that kind are yielded.
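    #
    # Example (illustrative):
    #
    #   tokens.each(:comment) { |text, kind| puts text }
    #   # prints only the comment tokens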
    def each kind_filter = nil, &block
      unless kind_filter
        super(&block)
      else
        super() do |text, kind|
          next unless kind == kind_filter
          yield text, kind
        end
      end
    end

    # Iterates over all text tokens.
    # Range tokens like [:open, :string] are left out.
    #
    # Example:
    #   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
    def each_text_token
      each do |text, kind|
        next unless text.is_a? ::String
        yield text, kind
      end
    end

    # Encode the tokens using encoder.
    #
    # encoder can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # options are passed to the encoder.
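    #
    # All three forms in use (illustrative; assumes the :html encoder
    # that ships with CodeRay):
    #
    #   tokens.encode :html                        # symbol
    #   tokens.encode CodeRay::Encoders::HTML      # class
    #   tokens.encode CodeRay::Encoders::HTML.new  # object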
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        # Use the class directly, or look the symbol up in Encoders[].
        encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
        encoder = encoder_class.new options
      end
      encoder.encode_tokens self, options
    end

    # Turn into a string using Encoders::Text.
    #
    # +options+ are passed to the encoder if given.
    def to_s options = {}
      encode :text, options
    end

    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
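    #
    # Equivalent calls (illustrative):
    #
    #   tokens.html   # same as tokens.encode(:html)
    #   tokens.text   # same as tokens.encode(:text)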
    def method_missing meth, options = {}
      Encoders[meth].new(options).encode_tokens self
    end

    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This cannot be undone, but should yield the same output
    # in most Encoders. It basically makes the output smaller.
    #
    # Combined with dump, it saves space at the cost of time.
    #
    # If the scanner is written carefully, this is not required -
    # for example, consecutive //-comment lines could already be
    # joined into one comment token by the Scanner.
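    #
    # Example (illustrative):
    #
    #   Tokens[['a', :code], ['b', :code], [' ', :space]].optimize
    #   #-> [['ab', :code], [' ', :space]]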
    def optimize
      last_kind = last_text = nil
      new = self.class.new
      for text, kind in self
        if text.is_a? String
          if kind == last_kind
            last_text << text
          else
            new << [last_text, last_kind] if last_kind
            last_text = text
            last_kind = kind
          end
        else
          new << [last_text, last_kind] if last_kind
          last_kind = last_text = nil
          new << [text, kind]
        end
      end
      new << [last_text, last_kind] if last_kind
      new
    end

    # Compact the object itself; see optimize.
    def optimize!
      replace optimize
    end

    # Ensure that all :open tokens have a corresponding :close one.
    #
    # TODO: Test this!
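    #
    # Intended behavior (illustrative, untested like the method itself):
    #
    #   Tokens[[:open, :string], ['"a', :content]].fix
    #   #-> [[:open, :string], ['"a', :content], [:close, :string]]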
    def fix
      tokens = self.class.new
      # Check token nesting using a stack of kinds.
      opened = []
      for type, kind in self
        case type
        when :open
          opened.push [:close, kind]
        when :begin_line
          opened.push [:end_line, kind]
        when :close, :end_line
          expected = opened.pop
          if [type, kind] != expected
            # Unexpected :close; decide what to do based on the kind:
            # - token was never opened: delete the :close (just skip it)
            next unless opened.rindex expected
            # - token was opened earlier: also close tokens in between
            tokens << token until (token = opened.pop) == expected
          end
        end
        tokens << [type, kind]
      end
      # Close remaining opened tokens.
      tokens << token while token = opened.pop
      tokens
    end

    def fix!
      replace fix
    end

    # TODO: Scanner#split_into_lines
    #
    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other tokens are single-line)
    # - there are no open tokens at the end of a line
    #
    # This makes it simple for encoders that work line by line,
    # like HTML with list-style line numbering.
    def split_into_lines
      raise NotImplementedError
    end

    def split_into_lines!
      replace split_into_lines
    end

    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # in addition, it is gzipped using GZip.gzip.
    #
    # The returned String object includes Undumping,
    # so it has an #undump method. See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases, as it is a good compromise between
    # speed and compression rate.
    #
    # See the GZip module.
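    #
    # Round trip (illustrative):
    #
    #   blob = tokens.dump 9   # highest compression
    #   blob.undump == tokens  #-> true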
    def dump gzip_level = 7
      require 'coderay/helpers/gzip_simple'
      dump = Marshal.dump self
      dump = dump.gzip gzip_level
      dump.extend Undumping
    end

    # The total size of the tokens;
    # should be equal to the input size before scanning.
    def text_size
      size = 0
      each_text_token do |t, k|
        size += t.size
      end
      size
    end

    # Returns all text tokens joined into a single string.
    def text
      map { |t, k| t if t.is_a? ::String }.join
    end

    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end

    # Undump the object using Marshal.load, then
    # unzip it using GZip.gunzip.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
    def Tokens.load dump
      require 'coderay/helpers/gzip_simple'
      dump = dump.gunzip
      Marshal.load dump
    end

  end

  # = TokenStream
  #
  # The TokenStream class is a fake Array without elements.
  #
  # It redirects the method << to a block given at creation.
  #
  # This allows Scanners and Encoders to use streaming (no
  # tokens are saved; the input is highlighted at the same time it
  # is scanned) with the same code.
  #
  # See CodeRay.encode_stream and CodeRay.scan_stream.
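  #
  # Streaming sketch (illustrative; assumes the CodeRay 0.9 streaming API):
  #
  #   CodeRay.scan_stream 'puts 1', :ruby do |text, kind|
  #     # each token is handled here as soon as it is scanned
  #   end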
  class TokenStream < Tokens

    # Whether the object is a TokenStream.
    #
    # Returns true.
    def stream?
      true
    end

    # The Array is empty, but size counts the tokens given by <<.
    attr_reader :size

    # Creates a new TokenStream that calls +block+ whenever
    # its << method is called.
    #
    # Example:
    #
    #   require 'coderay'
    #
    #   token_stream = CodeRay::TokenStream.new do |text, kind|
    #     puts 'kind: %s, text size: %d.' % [kind, text.size]
    #   end
    #
    #   token_stream << ['/\d+/', :regexp]
    #   #-> kind: regexp, text size: 5.
    #
    def initialize &block
      raise ArgumentError, 'Block expected for streaming.' unless block
      @callback = block
      @size = 0
    end

    # Calls +block+ with +token+ and increments size.
    #
    # Returns self.
    def << token
      @callback.call(*token)
      @size += 1
      self
    end

    # This method is not implemented for speed reasons. Use Tokens.
    def text_size
      raise NotImplementedError,
        'This method is not implemented for speed reasons.'
    end

    # A TokenStream cannot be dumped. Use Tokens.
    def dump
      raise NotImplementedError, 'A TokenStream cannot be dumped.'
    end

    # A TokenStream cannot be optimized. Use Tokens.
    def optimize
      raise NotImplementedError, 'A TokenStream cannot be optimized.'
    end

  end

end

if $0 == __FILE__
  $VERBOSE = true
  $: << File.join(File.dirname(__FILE__), '..')
  eval DATA.read, nil, $0, __LINE__ + 4
end

__END__
require 'test/unit'

class TokensTest < Test::Unit::TestCase

  def test_creation
    assert CodeRay::Tokens < Array
    tokens = nil
    assert_nothing_raised do
      tokens = CodeRay::Tokens.new
    end
    assert_kind_of Array, tokens
  end

  def test_adding_tokens
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    assert_equal 2, tokens.size
  end

  def test_dump_undump
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    tokens2 = nil
    assert_nothing_raised do
      tokens2 = tokens.dump.undump
    end
    assert_equal tokens, tokens2
  end
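
  # Illustrative extra test (not in the original suite): #optimize should
  # join consecutive text tokens of the same kind.
  def test_optimize
    tokens = CodeRay::Tokens[['a', :code], ['b', :code], [' ', :space]]
    assert_equal CodeRay::Tokens[['ab', :code], [' ', :space]], tokens.optimize
  end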

end