annotate vendor/gems/coderay-0.9.7/lib/coderay/tokens.rb @ 855:7294e8db2515 bug_162

Close obsolete branch bug_162
author Chris Cannam
date Thu, 14 Jul 2011 11:59:19 +0100
parents 0579821a129a
children
rev   line source
Chris@210 1 module CodeRay
Chris@210 2
Chris@210 3 # = Tokens
Chris@210 4 #
Chris@210 5 # The Tokens class represents a list of tokens returned from
Chris@210 6 # a Scanner.
Chris@210 7 #
Chris@210 8 # A token is not a special object, just a two-element Array
Chris@210 9 # consisting of
Chris@210 10 # * the _token_ _text_ (the original source of the token in a String) or
Chris@210 11 # a _token_ _action_ (:open, :close, :begin_line, :end_line)
Chris@210 12 # * the _token_ _kind_ (a Symbol representing the type of the token)
Chris@210 13 #
Chris@210 14 # A token looks like this:
Chris@210 15 #
Chris@210 16 # ['# It looks like this', :comment]
Chris@210 17 # ['3.1415926', :float]
Chris@210 18 # ['$^', :error]
Chris@210 19 #
Chris@210 20 # Some scanners also yield sub-tokens, represented by special
Chris@210 21 # token actions, namely :open and :close.
Chris@210 22 #
Chris@210 23 # The Ruby scanner, for example, splits "a string" into:
Chris@210 24 #
Chris@210 25 # [
Chris@210 26 # [:open, :string],
Chris@210 27 # ['"', :delimiter],
Chris@210 28 # ['a string', :content],
Chris@210 29 # ['"', :delimiter],
Chris@210 30 # [:close, :string]
Chris@210 31 # ]
Chris@210 32 #
Chris@210 33 # Tokens is the interface between Scanners and Encoders:
Chris@210 34 # The input is split and saved into a Tokens object. The Encoder
Chris@210 35 # then builds the output from this object.
Chris@210 36 #
Chris@210 37 # Thus, the syntax below becomes clear:
Chris@210 38 #
Chris@210 39 # CodeRay.scan('price = 2.59', :ruby).html
Chris@210 40 # # the Tokens object is here -------^
Chris@210 41 #
Chris@210 42 # See how small it is? ;)
Chris@210 43 #
Chris@210 44 # Tokens gives you the power to handle pre-scanned code very easily:
Chris@210 45 # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
Chris@210 46 # that you put in your DB.
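#
# A minimal sketch of such a round trip (the file name is only an
# example, not part of the API):
#
# tokens = CodeRay.scan(File.read('price.rb'), :ruby)
# File.open('tokens.dump', 'wb') { |f| f.write tokens.dump }
# # ... later, perhaps in another process ...
# dump = File.open('tokens.dump', 'rb') { |f| f.read }
# CodeRay::Tokens.load(dump).encode :html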
Chris@210 47 #
Chris@210 48 # It also allows you to generate tokens directly (without using a scanner),
Chris@210 49 # to load them from a file, and still use any Encoder that CodeRay provides.
Chris@210 50 #
Chris@210 51 # Tokens' subclass TokenStream allows streaming to save memory.
Chris@210 52 class Tokens < Array
Chris@210 53
Chris@210 54 # The Scanner instance that created the tokens.
Chris@210 55 attr_accessor :scanner
Chris@210 56
Chris@210 57 # Whether the object is a TokenStream.
Chris@210 58 #
Chris@210 59 # Returns false.
Chris@210 60 def stream?
Chris@210 61 false
Chris@210 62 end
Chris@210 63
Chris@210 64 # Iterates over all tokens.
Chris@210 65 #
Chris@210 66 # If a filter is given, only tokens of that kind are yielded.
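#
# A short sketch (assuming +tokens+ already holds scanned code):
#
# tokens.each(:comment) { |text, kind| puts text }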
Chris@210 67 def each kind_filter = nil, &block
Chris@210 68 unless kind_filter
Chris@210 69 super(&block)
Chris@210 70 else
Chris@210 71 super() do |text, kind|
Chris@210 72 next unless kind == kind_filter
Chris@210 73 yield text, kind
Chris@210 74 end
Chris@210 75 end
Chris@210 76 end
Chris@210 77
Chris@210 78 # Iterates over all text tokens.
Chris@210 79 # Range tokens like [:open, :string] are left out.
Chris@210 80 #
Chris@210 81 # Example:
Chris@210 82 # tokens.each_text_token { |text, kind| text.replace html_escape(text) }
Chris@210 83 def each_text_token
Chris@210 84 each do |text, kind|
Chris@210 85 next unless text.is_a? ::String
Chris@210 86 yield text, kind
Chris@210 87 end
Chris@210 88 end
Chris@210 89
Chris@210 90 # Encode the tokens using encoder.
Chris@210 91 #
Chris@210 92 # encoder can be
Chris@210 93 # * a symbol like :html or :statistic
Chris@210 94 # * an Encoder class
Chris@210 95 # * an Encoder object
Chris@210 96 #
Chris@210 97 # options are passed to the encoder.
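#
# For example:
#
# tokens.encode :html
# tokens.encode :statistic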
Chris@210 98 def encode encoder, options = {}
Chris@210 99 unless encoder.is_a? Encoders::Encoder
Chris@210 100 unless encoder.is_a? Class
Chris@210 101 encoder = Encoders[encoder]
Chris@210 102 end
Chris@210 103 encoder = encoder.new options
Chris@210 104 end
Chris@210 105 encoder.encode_tokens self, options
Chris@210 106 end
Chris@210 107
Chris@210 108
Chris@210 109 # Turn into a string using Encoders::Text.
Chris@210 110 #
Chris@210 111 # +options+ are passed to the encoder if given.
Chris@210 112 def to_s options = {}
Chris@210 113 encode :text, options
Chris@210 114 end
Chris@210 115
Chris@210 116 # Redirects unknown methods to encoder calls.
Chris@210 117 #
Chris@210 118 # For example, if you call +tokens.html+, the HTML encoder
Chris@210 119 # is used to highlight the tokens.
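#
# So, for instance, these two calls should be equivalent:
#
# tokens.html
# tokens.encode :html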
Chris@210 120 def method_missing meth, options = {}
Chris@210 121 Encoders[meth].new(options).encode_tokens self
Chris@210 122 end
Chris@210 123
Chris@210 124 # Returns the tokens compressed by joining consecutive
Chris@210 125 # tokens of the same kind.
Chris@210 126 #
Chris@210 127 # This cannot be undone, but it should yield the same output
Chris@210 128 # with most Encoders. It basically makes the token list smaller.
Chris@210 129 #
Chris@210 130 # Combined with dump, it saves space for the cost of time.
Chris@210 131 #
Chris@210 132 # If the scanner is written carefully, this is not required -
Chris@210 133 # for example, consecutive //-comment lines could already be
Chris@210 134 # joined into one comment token by the Scanner.
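#
# Roughly, consecutive text tokens of one kind are merged:
#
# [['foo', :plain], ['bar', :plain]] #=> [['foobar', :plain]]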
Chris@210 135 def optimize
Chris@210 136 last_kind = last_text = nil
Chris@210 137 new = self.class.new
Chris@210 138 for text, kind in self
Chris@210 139 if text.is_a? String
Chris@210 140 if kind == last_kind
Chris@210 141 last_text << text
Chris@210 142 else
Chris@210 143 new << [last_text, last_kind] if last_kind
Chris@210 144 last_text = text
Chris@210 145 last_kind = kind
Chris@210 146 end
Chris@210 147 else
Chris@210 148 new << [last_text, last_kind] if last_kind
Chris@210 149 last_kind = last_text = nil
Chris@210 150 new << [text, kind]
Chris@210 151 end
Chris@210 152 end
Chris@210 153 new << [last_text, last_kind] if last_kind
Chris@210 154 new
Chris@210 155 end
Chris@210 156
Chris@210 157 # Compact the object itself; see optimize.
Chris@210 158 def optimize!
Chris@210 159 replace optimize
Chris@210 160 end
Chris@210 161
Chris@210 162 # Ensure that every :open token has a corresponding :close one.
Chris@210 163 #
Chris@210 164 # TODO: Test this!
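#
# For example (untested, per the TODO above), a trailing unclosed
# :open token should get a matching :close appended:
#
# Tokens[[:open, :string], ['"', :delimiter]].fix.last
# #=> [:close, :string]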
Chris@210 165 def fix
Chris@210 166 tokens = self.class.new
Chris@210 167 # Check token nesting using a stack of kinds.
Chris@210 168 opened = []
Chris@210 169 for type, kind in self
Chris@210 170 case type
Chris@210 171 when :open
Chris@210 172 opened.push [:close, kind]
Chris@210 173 when :begin_line
Chris@210 174 opened.push [:end_line, kind]
Chris@210 175 when :close, :end_line
Chris@210 176 expected = opened.pop
Chris@210 177 if [type, kind] != expected
Chris@210 178 # Unexpected :close; decide what to do based on the kind:
Chris@210 179 # - token was never opened: delete the :close (just skip it)
Chris@210 180 next unless opened.rindex expected
Chris@210 181 # - token was opened earlier: also close tokens in between
Chris@210 182 tokens << token until (token = opened.pop) == expected
Chris@210 183 end
Chris@210 184 end
Chris@210 185 tokens << [type, kind]
Chris@210 186 end
Chris@210 187 # Close remaining opened tokens
Chris@210 188 tokens << token while token = opened.pop
Chris@210 189 tokens
Chris@210 190 end
Chris@210 191
Chris@210 192 def fix!
Chris@210 193 replace fix
Chris@210 194 end
Chris@210 195
Chris@210 196 # TODO: Scanner#split_into_lines
Chris@210 197 #
Chris@210 198 # Makes sure that:
Chris@210 199 # - newlines are single tokens
Chris@210 200 # (which means all other tokens are single-line)
Chris@210 201 # - there are no open tokens at the end of the line
Chris@210 202 #
Chris@210 203 # This makes things simple for encoders that work line by line,
Chris@210 204 # like the HTML encoder with list-style line numbering.
Chris@210 205 def split_into_lines
Chris@210 206 raise NotImplementedError
Chris@210 207 end
Chris@210 208
Chris@210 209 def split_into_lines!
Chris@210 210 replace split_into_lines
Chris@210 211 end
Chris@210 212
Chris@210 213 # Dumps the object into a String that can be saved
Chris@210 214 # in files or databases.
Chris@210 215 #
Chris@210 216 # The dump is created with Marshal.dump;
Chris@210 217 # in addition, it is gzipped using GZip.gzip.
Chris@210 218 #
Chris@210 219 # The returned String object includes Undumping
Chris@210 220 # so it has an #undump method. See Tokens.load.
Chris@210 221 #
Chris@210 222 # You can configure the level of compression,
Chris@210 223 # but the default value 7 should be what you want
Chris@210 224 # in most cases as it is a good compromise between
Chris@210 225 # speed and compression rate.
Chris@210 226 #
Chris@210 227 # See GZip module.
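#
# For example (a sketch):
#
# small = tokens.dump     # default compression level 7
# fast = tokens.dump 1    # compresses faster, but less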
Chris@210 228 def dump gzip_level = 7
Chris@210 229 require 'coderay/helpers/gzip_simple'
Chris@210 230 dump = Marshal.dump self
Chris@210 231 dump = dump.gzip gzip_level
Chris@210 232 dump.extend Undumping
Chris@210 233 end
Chris@210 234
Chris@210 235 # The total size of the tokens.
Chris@210 236 # Should be equal to the input size before
Chris@210 237 # scanning.
Chris@210 238 def text_size
Chris@210 239 size = 0
Chris@210 240 each_text_token do |t, k|
Chris@210 241 size += t.size
Chris@210 242 end
Chris@210 243 size
Chris@210 244 end
Chris@210 245
Chris@210 246 # Return all text tokens joined into a single string.
Chris@210 247 def text
Chris@210 248 map { |t, k| t if t.is_a? ::String }.join
Chris@210 249 end
Chris@210 250
Chris@210 251 # Include this module to give an object an #undump
Chris@210 252 # method.
Chris@210 253 #
Chris@210 254 # The string returned by Tokens.dump includes Undumping.
Chris@210 255 module Undumping
Chris@210 256 # Calls Tokens.load with itself.
Chris@210 257 def undump
Chris@210 258 Tokens.load self
Chris@210 259 end
Chris@210 260 end
Chris@210 261
Chris@210 262 # Undump the object: unzip it using GZip.gunzip, then
Chris@210 263 # load it using Marshal.load.
Chris@210 264 #
Chris@210 265 # The result is commonly a Tokens object, but
Chris@210 266 # this is not guaranteed.
Chris@210 267 def Tokens.load dump
Chris@210 268 require 'coderay/helpers/gzip_simple'
Chris@210 269 dump = dump.gunzip
Chris@210 270 @dump = Marshal.load dump
Chris@210 271 end
Chris@210 272
Chris@210 273 end
Chris@210 274
Chris@210 275
Chris@210 276 # = TokenStream
Chris@210 277 #
Chris@210 278 # The TokenStream class is a fake Array without elements.
Chris@210 279 #
Chris@210 280 # It redirects the method << to a block given at creation.
Chris@210 281 #
Chris@210 282 # This allows Scanners and Encoders to use streaming (no
Chris@210 283 # tokens are saved; the input is highlighted at the same time it
Chris@210 284 # is scanned) with the same code.
Chris@210 285 #
Chris@210 286 # See CodeRay.encode_stream and CodeRay.scan_stream
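#
# A rough usage sketch (the exact CodeRay.scan_stream signature is an
# assumption here; check its own documentation):
#
# CodeRay.scan_stream 'puts 1 + 1', :ruby do |text, kind|
#   print text if text.is_a? ::String
# end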
Chris@210 287 class TokenStream < Tokens
Chris@210 288
Chris@210 289 # Whether the object is a TokenStream.
Chris@210 290 #
Chris@210 291 # Returns true.
Chris@210 292 def stream?
Chris@210 293 true
Chris@210 294 end
Chris@210 295
Chris@210 296 # The Array is empty, but size counts the tokens given by <<.
Chris@210 297 attr_reader :size
Chris@210 298
Chris@210 299 # Creates a new TokenStream that calls +block+ whenever
Chris@210 300 # its << method is called.
Chris@210 301 #
Chris@210 302 # Example:
Chris@210 303 #
Chris@210 304 # require 'coderay'
Chris@210 305 #
Chris@210 306 # token_stream = CodeRay::TokenStream.new do |text, kind|
Chris@210 307 # puts 'kind: %s, text size: %d.' % [kind, text.size]
Chris@210 308 # end
Chris@210 309 #
Chris@210 310 # token_stream << ['/\d+/', :regexp]
Chris@210 311 # #-> kind: regexp, text size: 5.
Chris@210 312 #
Chris@210 313 def initialize &block
Chris@210 314 raise ArgumentError, 'Block expected for streaming.' unless block
Chris@210 315 @callback = block
Chris@210 316 @size = 0
Chris@210 317 end
Chris@210 318
Chris@210 319 # Calls +block+ with +token+ and increments size.
Chris@210 320 #
Chris@210 321 # Returns self.
Chris@210 322 def << token
Chris@210 323 @callback.call(*token)
Chris@210 324 @size += 1
Chris@210 325 self
Chris@210 326 end
Chris@210 327
Chris@210 328 # This method is not implemented due to speed reasons. Use Tokens.
Chris@210 329 def text_size
Chris@210 330 raise NotImplementedError,
Chris@210 331 'This method is not implemented due to speed reasons.'
Chris@210 332 end
Chris@210 333
Chris@210 334 # A TokenStream cannot be dumped. Use Tokens.
Chris@210 335 def dump
Chris@210 336 raise NotImplementedError, 'A TokenStream cannot be dumped.'
Chris@210 337 end
Chris@210 338
Chris@210 339 # A TokenStream cannot be optimized. Use Tokens.
Chris@210 340 def optimize
Chris@210 341 raise NotImplementedError, 'A TokenStream cannot be optimized.'
Chris@210 342 end
Chris@210 343
Chris@210 344 end
Chris@210 345
Chris@210 346 end
Chris@210 347
Chris@210 348 if $0 == __FILE__
Chris@210 349 $VERBOSE = true
Chris@210 350 $: << File.join(File.dirname(__FILE__), '..')
Chris@210 351 eval DATA.read, nil, $0, __LINE__ + 4
Chris@210 352 end
Chris@210 353
Chris@210 354 __END__
Chris@210 355 require 'test/unit'
Chris@210 356
Chris@210 357 class TokensTest < Test::Unit::TestCase
Chris@210 358
Chris@210 359 def test_creation
Chris@210 360 assert CodeRay::Tokens < Array
Chris@210 361 tokens = nil
Chris@210 362 assert_nothing_raised do
Chris@210 363 tokens = CodeRay::Tokens.new
Chris@210 364 end
Chris@210 365 assert_kind_of Array, tokens
Chris@210 366 end
Chris@210 367
Chris@210 368 def test_adding_tokens
Chris@210 369 tokens = CodeRay::Tokens.new
Chris@210 370 assert_nothing_raised do
Chris@210 371 tokens << ['string', :type]
Chris@210 372 tokens << ['()', :operator]
Chris@210 373 end
Chris@210 374 assert_equal 2, tokens.size
Chris@210 375 end
Chris@210 376
Chris@210 377 def test_dump_undump
Chris@210 378 tokens = CodeRay::Tokens.new
Chris@210 379 assert_nothing_raised do
Chris@210 380 tokens << ['string', :type]
Chris@210 381 tokens << ['()', :operator]
Chris@210 382 end
Chris@210 383 tokens2 = nil
Chris@210 384 assert_nothing_raised do
Chris@210 385 tokens2 = tokens.dump.undump
Chris@210 386 end
Chris@210 387 assert_equal tokens, tokens2
Chris@210 388 end
Chris@210 389
Chris@210 390 end