To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / .svn / pristine / 78 / 78a1de16a7f51fffc3afacfeda93104b299506dc.svn-base @ 1297:0a574315af3e

History | View | Annotate | Download (9.13 KB)

1
# encoding: utf-8
2
require 'strscan'
3

    
4
module CodeRay
5

    
6
  autoload :WordList, 'coderay/helpers/word_list'
7
  
8
  # = Scanners
9
  #
10
  # This module holds the Scanner class and its subclasses.
11
  # For example, the Ruby scanner is named CodeRay::Scanners::Ruby
  # and can be found in coderay/scanners/ruby.
13
  #
14
  # Scanner also provides methods and constants for the register
15
  # mechanism and the [] method that returns the Scanner class
16
  # belonging to the given lang.
17
  #
18
  # See PluginHost.
19
  module Scanners
20
    extend PluginHost
21
    plugin_path File.dirname(__FILE__), 'scanners'
22
    
23
    
24
    # = Scanner
25
    #
26
    # The base class for all Scanners.
27
    #
28
    # It is a subclass of Ruby's great +StringScanner+, which
29
    # makes it easy to access the scanning methods inside.
30
    #
31
    # It is also +Enumerable+, so you can use it like an Array of
32
    # Tokens:
33
    #
34
    #   require 'coderay'
35
    #   
36
    #   c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
37
    #   
38
    #   for text, kind in c_scanner
39
    #     puts text if kind == :operator
40
    #   end
41
    #   
42
    #   # prints: (*==)++;
43
    #
44
    # OK, this is a very simple example :)
45
    # You can also use +map+, +any?+, +find+ and even +sort_by+,
46
    # if you want.
47
    class Scanner < StringScanner
48
      
49
      extend Plugin
50
      plugin_host Scanners
51
      
52
      # Raised if a Scanner fails while scanning
53
      ScanError = Class.new StandardError
54
      
55
      # The default options for all scanner classes.
      #
      # Subclasses provide their own defaults by defining a DEFAULT_OPTIONS
      # constant; it is looked up through self.class when a scanner is created.
58
      DEFAULT_OPTIONS = { }
59
      
60
      KINDS_NOT_LOC = [:comment, :doctype, :docstring]
61
      
62
      attr_accessor :state
63
      
64
      class << self
65
        
66
        # Normalizes the given code into a string with UNIX newlines, in the
        # scanner's internal encoding, with invalid and undefined characters
        # replaced by placeholders.
        #
        # Non-String input is converted with +to_s+ first. An empty string is
        # returned as is; everything else is handed to +encode_with_encoding+
        # (when Strings know their encoding) or to +to_unix+.
        #
        # Note: the input object itself may be returned when nothing had to
        # be converted, so callers must not rely on getting a copy.
        def normalize code
          code = code.to_s unless code.is_a? ::String
          return code if code.empty?

          if code.respond_to? :encoding
            encode_with_encoding code, encoding
          else
            to_unix code
          end
        end
82
        
83
        # The typical filename suffix for this scanner's language.
        #
        # The value is memoized: the first call fixes the suffix (by default
        # the scanner's +lang+), arguments of later calls are ignored.
        def file_extension extension = lang
          @file_extension ||= extension.to_s
        end
87
        
88
        # The encoding used internally by this scanner.
        #
        # Returns the Encoding object for +name+, memoized on the first call
        # (later arguments are ignored), or a false value on Rubies without
        # Encoding support.
        def encoding name = 'UTF-8'
          @encoding ||= defined?(Encoding.find) && Encoding.find(name)
        end
92
        
93
        # The lang of this Scanner class, which is equal to its Plugin ID.
        def lang
          @plugin_id
        end
97
        
98
      protected
99
        
100
        # Converts +code+ to +target_encoding+, turning all line endings into
        # "\n" and replacing invalid or undefined characters.
        #
        # A valid string that already is in the target encoding only gets its
        # newlines converted. An invalid one is transcoded from the encoding
        # reported by +guess_encoding+; any other string is transcoded from
        # its own encoding.
        def encode_with_encoding code, target_encoding
          if code.encoding == target_encoding
            return to_unix(code) if code.valid_encoding?
            source_encoding = guess_encoding code
          else
            source_encoding = code.encoding
          end

          code.encode target_encoding, source_encoding,
            :universal_newline => true, :undef => :replace, :invalid => :replace
        end
113
        
114
        # Returns +code+ with CRLF and lone CR line endings converted to "\n".
        # The string itself is returned untouched when it contains no
        # carriage return.
        def to_unix code
          return code unless code.index(?\r)
          code.gsub(/\r\n?/, "\n")
        end
117
        
118
        # Guesses the encoding of +s+ by piping its first 1024 characters to
        # the external +file+ utility and reading the charset it reports.
        #
        # Falls back to Encoding::BINARY whenever no usable answer is
        # available: the utility cannot be run, prints nothing, reports no
        # charset, or names a charset Ruby does not know.
        def guess_encoding s
          #:nocov:
          IO.popen("file -b --mime -", "w+") do |file|
            file.write s[0, 1024]
            file.close_write
            # Both the output line and the charset match may be missing.
            charset = file.gets.to_s[/charset=([-\w]+)/, 1]
            begin
              charset ? Encoding.find(charset) : Encoding::BINARY
            rescue ArgumentError
              Encoding::BINARY
            end
          end
        rescue SystemCallError
          # The +file+ utility is not available or the pipe broke.
          Encoding::BINARY
          #:nocov:
        end
131
        
132
      end
133
      
134
      # Create a new Scanner.
      #
      # * +code+ is the input String. It is normalized and then handed to
      #   the superclass StringScanner.
      # * +options+ is a Hash with Symbols as keys.
      #   It is merged with the default options of the class (you can
      #   overwrite default options here.)
      #
      # Scanned tokens are stored in <tt>options[:tokens]</tt> if given.
      # Else, a Tokens object is used.
      #
      # Raises NotImplementedError when called on the Scanner class itself.
      def initialize code = '', options = {}
        raise NotImplementedError, "I am only the basic Scanner class. I can't scan anything. :( Use my subclasses." if self.class == Scanner

        @options = self.class::DEFAULT_OPTIONS.merge options

        super self.class.normalize(code)

        @tokens = options[:tokens] || Tokens.new
        @tokens.scanner = self if @tokens.respond_to? :scanner=

        setup
      end
157
      
158
      # Sets back the scanner. Subclasses should redefine the reset_instance
      # method instead of this one.
      def reset
        super
        reset_instance
      end
164
      
165
      # Set a new string to be scanned. The code is normalized before it is
      # passed to the superclass; the per-scan state is reset afterwards.
      def string= code
        code = self.class.normalize(code)
        super code
        reset_instance
      end
171
      
172
      # The Plugin ID for this scanner, as reported by its class.
      def lang
        self.class.lang
      end
176
      
177
      # The default file extension for this scanner, as reported by its
      # class.
      def file_extension
        self.class.file_extension
      end
181
      
182
      # Scan the code and returns all tokens in a Tokens object.
      #
      # * +source+ may be nil (the current string is scanned again), an
      #   Array of code parts (they are joined for scanning and the result
      #   is split back into parts), or any other object that is set as the
      #   new string.
      # * +options+ is merged over the options given to the constructor.
      #
      # A StandardError raised while scanning is reported through
      # raise_inspect.
      def tokenize source = nil, options = {}
        options = @options.merge(options)
        @tokens = options[:tokens] || @tokens || Tokens.new
        @tokens.scanner = self if @tokens.respond_to? :scanner=

        # string= normalizes its argument, so the code is not normalized a
        # second time here.
        case source
        when Array
          self.string = source.join
        when nil
          reset
        else
          self.string = source
        end

        begin
          scan_tokens @tokens, options
        rescue => e
          message = "Error in %s#scan_tokens, initial state was: %p" % [self.class, defined?(state) && state]
          raise_inspect e.message, @tokens, message, 30, e.backtrace
        end

        @cached_tokens = @tokens
        if source.is_a? Array
          @tokens.split_into_parts(*source.map { |part| part.size })
        else
          @tokens
        end
      end
210
      
211
      # Cache the result of tokenize: the code is only scanned on the first
      # call, later calls return the stored tokens.
      def tokens
        @cached_tokens ||= tokenize
      end
215
      
216
      # Traverse the tokens, passing each one to the given block.
      def each &block
        tokens.each(&block)
      end
220
      include Enumerable
221
      
222
      # The current line position of the scanner, starting with 1.
      # See also: #column.
      #
      # +pos+ is an offset into #binary_string and defaults to the
      # scanner's current position.
      #
      # Beware, this is implemented inefficiently. It should be used
      # for debugging only.
      def line pos = self.pos
        return 1 if pos <= 0
        binary_string[0...pos].count("\n") + 1
      end
231
      
232
      # The current column position of the scanner, starting with 1.
      # See also: #line.
      #
      # +pos+ is an offset into #binary_string and defaults to the
      # scanner's current position; the column is the distance to the
      # closest preceding newline (or to the start of the string).
      def column pos = self.pos
        return 1 if pos <= 0
        pos - (binary_string.rindex(?\n, pos - 1) || -1)
      end
238
      
239
      # The string in binary encoding.
      #
      # To be used with #pos, which is the index of the byte the scanner
      # will scan next.
      #
      # A binary copy is only made when the string contains multibyte
      # characters; otherwise the string itself is returned. The result is
      # memoized.
      def binary_string
        @binary_string ||=
          if string.respond_to?(:bytesize) && string.bytesize != string.size
            #:nocov:
            string.dup.force_encoding('binary')
            #:nocov:
          else
            string
          end
      end
253
      
254
    protected
255
      
256
      # Can be implemented by subclasses to do some initialization
      # that has to be done once per instance.
      #
      # Use reset for initialization that has to be done once per
      # scan.
      #
      # The default implementation does nothing.
      def setup  # :doc:
      end
263
      
264
      # This is the central method, and commonly the only one a
      # subclass implements.
      #
      # Subclasses must implement this method; it must return +tokens+
      # and must only use Tokens#<< for storing scanned tokens!
      #
      # The default implementation raises NotImplementedError.
      def scan_tokens tokens, options  # :doc:
        raise NotImplementedError, "#{self.class}#scan_tokens not implemented."
      end
272
      
273
      # Resets the scanner: empties the collected tokens (unless the
      # :keep_tokens option is set) and drops the cached tokens and the
      # cached binary string.
      def reset_instance
        @tokens.clear if @tokens.respond_to?(:clear) && !@options[:keep_tokens]
        @cached_tokens = nil
        @binary_string = nil if defined? @binary_string
      end
279
      
280
      # Scanner error with additional status information.
      #
      # Raises a ScanError whose message reports +msg+, the last (up to 10)
      # tokens, the scanner position and state, and up to +ambit+ bytes of
      # code before and after the current position. +backtrace+ is attached
      # to the raised error.
      def raise_inspect msg, tokens, state = self.state || 'No state given!', ambit = 30, backtrace = caller
        # Clamp the start of the look-behind window: a negative index would
        # count from the end of the string.
        code_before = binary_string[[pos - ambit, 0].max...pos]
        code_after  = binary_string[pos, ambit]

        raise ScanError, <<-EOE % [


***ERROR in %s: %s (after %d tokens)

tokens:
%s

current line: %d  column: %d  pos: %d
matched: %p  state: %p
bol? = %p,  eos? = %p

surrounding code:
%p  ~~  %p


***ERROR***

        EOE
          File.basename(caller[0]),
          msg,
          tokens.respond_to?(:size) ? tokens.size : 0,
          tokens.respond_to?(:last) ? tokens.last(10).map { |t| t.inspect }.join("\n") : '',
          line, column, pos,
          matched, state, bol?, eos?,
          code_before,
          code_after,
        ], backtrace
      end
311
      
312
      # Shorthand for scan_until(/\z/).
      # This method also avoids a JRuby 1.9 mode bug.
      #
      # Returns the unscanned remainder of the string and moves the
      # scanner to the end.
      def scan_rest
        remainder = rest
        terminate
        remainder
      end
319
      
320
    end
321
    
322
  end
323
end