annotate vendor/plugins/coderay-0.9.2/lib/coderay/.svn/text-base/tokens.rb.svn-base @ 861:b8105f717bf7 bug_182

Close obsolete branch bug_182
author Chris Cannam
date Fri, 10 Jun 2011 16:49:58 +0100
parents 513646585e45
children
rev   line source
Chris@0 1 module CodeRay
Chris@0 2
# = Tokens
#
# The Tokens class represents a list of tokens returned from
# a Scanner.
#
# A token is not a special object, just a two-element Array
# consisting of
# * the _token_ _text_ (the original source of the token in a String)
# * the _token_ _kind_ (a Symbol representing the type of the token)
#
# A token looks like this:
#
#   ['# It looks like this', :comment]
#   ['3.1415926', :float]
#   ['$^', :error]
#
# Some scanners also yield some kind of sub-tokens, represented by special
# token texts, namely :open and :close .
#
# The Ruby scanner, for example, splits "a string" into:
#
#   [
#     [:open, :string],
#     ['"', :delimiter],
#     ['a string', :content],
#     ['"', :delimiter],
#     [:close, :string]
#   ]
#
# Tokens is also the interface between Scanners and Encoders:
# The input is split and saved into a Tokens object. The Encoder
# then builds the output from this object.
#
# Thus, the syntax below becomes clear:
#
#   CodeRay.scan('price = 2.59', :ruby).html
#   # the Tokens object is here -------^
#
# See how small it is? ;)
#
# Tokens gives you the power to handle pre-scanned code very easily:
# You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
# that you put in your DB.
#
# Tokens' subclass TokenStream allows streaming to save memory.
class Tokens < Array

  # The Scanner instance that created the tokens.
  attr_accessor :scanner

  # Whether the object is a TokenStream.
  #
  # Returns false.
  def stream?
    false
  end

  # Iterates over all tokens.
  #
  # Without a filter, every token (a two-element Array) is passed to the
  # block unchanged. If +kind_filter+ is given, only tokens of that kind
  # are yielded, as two block arguments (text, kind).
  #
  # Returns an enumerator when a filter is given but no block.
  def each(kind_filter = nil, &block)
    return super(&block) unless kind_filter
    return enum_for(:each, kind_filter) unless block
    super() do |text, kind|
      yield text, kind if kind == kind_filter
    end
  end

  # Iterates over all text tokens.
  # Range tokens like [:open, :string] are left out.
  #
  # Returns an enumerator when called without a block.
  #
  # Example:
  #   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
  def each_text_token
    return enum_for(:each_text_token) unless block_given?
    each do |text, kind|
      yield text, kind if text.is_a? ::String
    end
  end

  # Encode the tokens using encoder.
  #
  # encoder can be
  # * a symbol like :html or :statistic
  # * an Encoder class
  # * an Encoder object
  #
  # options are passed to the encoder.
  def encode(encoder, options = {})
    unless encoder.is_a? Encoders::Encoder
      # A class is instantiated directly; anything else (e.g. a Symbol)
      # is looked up through Encoders[].
      encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
      encoder = encoder_class.new options
    end
    encoder.encode_tokens self, options
  end

  # Turn into a string using Encoders::Text.
  #
  # +options+ are passed to the encoder if given.
  def to_s(options = {})
    encode :text, options
  end

  # Redirects unknown methods to encoder calls.
  #
  # For example, if you call +tokens.html+, the HTML encoder
  # is used to highlight the tokens.
  #
  # NOTE(review): every unknown method name is handed to Encoders[];
  # what happens for names that are not encoders depends on Encoders[],
  # which is defined elsewhere.
  def method_missing(meth, options = {})
    Encoders[meth].new(options).encode_tokens self
  end

  # Returns the tokens compressed by joining consecutive
  # tokens of the same kind.
  #
  # This can not be undone, but should yield the same output
  # in most Encoders. It basically makes the output smaller.
  #
  # Combined with dump, it saves space for the cost of time.
  #
  # If the scanner is written carefully, this is not required -
  # for example, consecutive //-comment lines could already be
  # joined in one comment token by the Scanner.
  #
  # The receiver and its token strings are left untouched; joined
  # texts are built on copies.
  def optimize
    optimized = self.class.new
    run_text = run_kind = nil
    each do |text, kind|
      if text.is_a? ::String
        if run_text && kind == run_kind
          run_text << text
        else
          optimized << [run_text, run_kind] if run_text
          # Copy, so appending to the run never mutates the source token.
          run_text = text.dup
          run_kind = kind
        end
      else
        # Range tokens (:open, :close, ...) end the current run.
        optimized << [run_text, run_kind] if run_text
        run_text = run_kind = nil
        optimized << [text, kind]
      end
    end
    optimized << [run_text, run_kind] if run_text
    optimized
  end

  # Compact the object itself; see optimize.
  def optimize!
    replace optimize
  end

  # Ensure that all :open tokens have a correspondent :close one
  # (and all :begin_line tokens a correspondent :end_line).
  #
  # Returns a new object of the same class in which
  # * closing tokens that were never opened are dropped,
  # * a closing token for an outer range also closes the ranges
  #   opened in between,
  # * ranges still open at the end are closed.
  def fix
    fixed = self.class.new
    # Stack of the closing tokens we still expect, innermost last.
    pending = []
    each do |type, kind|
      case type
      when :open
        pending.push [:close, kind]
      when :begin_line
        pending.push [:end_line, kind]
      when :close, :end_line
        closer = [type, kind]
        # Token was never opened: delete the closing token (just skip it).
        next unless pending.include? closer
        # Token was opened earlier: also close tokens in between.
        fixed << pending.pop until pending.last == closer
        pending.pop
      end
      fixed << [type, kind]
    end
    # Close remaining opened tokens.
    fixed << pending.pop until pending.empty?
    fixed
  end

  # Repair the object itself; see fix.
  def fix!
    replace fix
  end

  # TODO: Scanner#split_into_lines
  #
  # Makes sure that:
  # - newlines are single tokens
  #   (which means all other token are single-line)
  # - there are no open tokens at the end the line
  #
  # This makes it simple for encoders that work line-oriented,
  # like HTML with list-style numeration.
  #
  # Not implemented yet; always raises NotImplementedError.
  def split_into_lines
    raise NotImplementedError
  end

  # In-place variant of split_into_lines.
  def split_into_lines!
    replace split_into_lines
  end

  # Dumps the object into a String that can be saved
  # in files or databases.
  #
  # The dump is created with Marshal.dump;
  # in addition, it is compressed by calling #gzip on the result
  # (presumably provided by coderay/helpers/gzip_simple, which is
  # loaded on demand here).
  #
  # The returned String object is extended with Undumping
  # so it has an #undump method. See Tokens.load.
  #
  # You can configure the level of compression,
  # but the default value 7 should be what you want
  # in most cases as it is a good compromise between
  # speed and compression rate.
  #
  # See GZip module.
  def dump(gzip_level = 7)
    require 'coderay/helpers/gzip_simple'
    data = Marshal.dump self
    data = data.gzip gzip_level
    data.extend Undumping
  end

  # The total size of the tokens.
  # Should be equal to the input size before
  # scanning.
  def text_size
    total = 0
    each_text_token { |text, _kind| total += text.size }
    total
  end

  # The concatenated text of all text tokens.
  # Range tokens like [:open, :string] contribute nothing.
  def text
    map { |token_text, _kind| token_text if token_text.is_a? ::String }.join
  end

  # Include this module to give an object an #undump
  # method.
  #
  # The string returned by Tokens.dump includes Undumping.
  module Undumping
    # Calls Tokens.load with itself.
    def undump
      Tokens.load self
    end
  end

  # Undump the object: unzip it by calling #gunzip on +dump+, then
  # restore it using Marshal.load.
  #
  # The result is commonly a Tokens object, but
  # this is not guaranteed.
  #
  # WARNING: Marshal.load can instantiate arbitrary objects; only pass
  # dumps that come from a trusted source.
  def self.load(dump)
    require 'coderay/helpers/gzip_simple'
    Marshal.load dump.gunzip
  end

end
Chris@0 272
Chris@0 273
# = TokenStream
#
# The TokenStream class is a fake Array without elements.
#
# It redirects the method << to a block given at creation.
#
# This allows scanners and Encoders to use streaming (no
# tokens are saved, the input is highlighted the same time it
# is scanned) with the same code.
#
# See CodeRay.encode_stream and CodeRay.scan_stream
class TokenStream < Tokens

  # The Array is empty, but size counts the tokens given by <<.
  attr_reader :size

  # Creates a new TokenStream that calls +block+ whenever
  # its << method is called.
  #
  # Raises ArgumentError if no block is given.
  #
  # Example:
  #
  #   require 'coderay'
  #
  #   token_stream = CodeRay::TokenStream.new do |kind, text|
  #     puts 'kind: %s, text size: %d.' % [kind, text.size]
  #   end
  #
  #   token_stream << [:regexp, '/\d+/']
  #   #-> kind: regexp, text size: 5.
  #
  def initialize(&block)
    raise ArgumentError, 'Block expected for streaming.' unless block
    @callback = block
    @size = 0
  end

  # Whether the object is a TokenStream.
  #
  # Returns true.
  def stream?
    true
  end

  # Calls +block+ with +token+ and increments size.
  # The token is splatted, so a two-element token arrives as
  # two block arguments.
  #
  # Returns self.
  def <<(token)
    @callback.call(*token)
    @size += 1
    self
  end

  # This method is not implemented due to speed reasons. Use Tokens.
  def text_size
    raise NotImplementedError,
      'This method is not implemented due to speed reasons.'
  end

  # A TokenStream cannot be dumped. Use Tokens.
  #
  # Accepts (and ignores) the same optional +gzip_level+ as the
  # superclass method, so callers passing a level get the
  # NotImplementedError rather than an ArgumentError.
  def dump(gzip_level = 7)
    raise NotImplementedError, 'A TokenStream cannot be dumped.'
  end

  # A TokenStream cannot be optimized. Use Tokens.
  def optimize
    raise NotImplementedError, 'A TokenStream cannot be optimized.'
  end

end
Chris@0 343
Chris@0 344 end
Chris@0 345
# When this file is executed directly (rather than required), run the
# test code that is stored after the __END__ marker.
if __FILE__ == $PROGRAM_NAME
  $VERBOSE = true
  $LOAD_PATH << File.join(File.dirname(__FILE__), '..')
  # The line offset makes errors in the embedded code report their real
  # position in this file; it assumes the embedded code starts four lines
  # below the eval call, so keep this line directly above the closing `end`.
  eval DATA.read, nil, $PROGRAM_NAME, __LINE__ + 4
end
Chris@0 351
Chris@0 352 __END__
Chris@0 353 require 'test/unit'
Chris@0 354
# Basic tests for CodeRay::Tokens: construction, appending tokens and a
# dump/undump round trip.
class TokensTest < Test::Unit::TestCase

  # Tokens is an Array subclass and can be instantiated without arguments.
  def test_creation
    assert CodeRay::Tokens < Array
    tokens = nil
    assert_nothing_raised do
      tokens = CodeRay::Tokens.new
    end
    assert_kind_of Array, tokens
  end

  # Tokens can be appended with << and are counted by size.
  def test_adding_tokens
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    # Expected value first, so a failure message reads correctly.
    assert_equal 2, tokens.size
  end

  # A dumped Tokens object is restored to an equal object by undump.
  def test_dump_undump
    tokens = CodeRay::Tokens.new
    assert_nothing_raised do
      tokens << ['string', :type]
      tokens << ['()', :operator]
    end
    tokens2 = nil
    assert_nothing_raised do
      tokens2 = tokens.dump.undump
    end
    assert_equal tokens, tokens2
  end

end