This scanner is really complex, since Ruby is a complex language!
It tries to highlight 100% of all common code, and 90% of unusual code.
It is optimized for HTML highlighting, and is not very useful for parsing or pretty printing.
For now, I think it’s better than the scanners in VIM or Syntax, or any highlighter I was able to find, except Caleb’s RubyLexer.
I hope it’s also better than the rdoc/irb lexer.
# File lib/coderay/scanners/ruby.rb, line 30
# Scans the scanner's input until end of string and appends tokens to +tokens+.
#
# Tokens are pushed as two-element arrays: [text, kind] for ordinary tokens,
# and [:open, type] / [:close, type] pairs around string-like literals and
# inline (interpolated) blocks. The method returns +tokens+.
#
# tokens  - collection that receives the tokens via <<.
# options - not referenced anywhere in the body shown here.
#
# Working state kept across loop iterations:
# * state              - either a Symbol (:initial, :def_expected,
#                        :module_expected, :undef_expected,
#                        :undef_comma_expected, :alias_expected) or a
#                        patterns::StringState while inside a literal.
# * value_expected,
#   last_token_dot     - one-shot flags. A branch assigns :set to keep the flag
#                        true for the next token; at the end of each iteration
#                        they are normalised to true/false.
# * heredocs           - nil, or an array of StringStates for heredocs that
#                        were opened on the current line and start at the
#                        next line break.
# * last_state         - string state to return to after scanning one token
#                        of normal code (used for #$var / #@var in strings).
# * inline_block_stack - saved [state, depth, heredocs] for each open #{ ... }.
# * depth              - brace nesting depth inside the current #{ ... }.
30: def scan_tokens tokens, options
31: last_token_dot = false
32: value_expected = true
33: heredocs = nil
34: last_state = nil
35: state = :initial
36: depth = nil
37: inline_block_stack = []
38: unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
39:
40: patterns = Patterns # avoid constant lookup
41:
42: until eos?
43: match = nil
44: kind = nil
45:
# --- Inside a string-like literal (state is a StringState) ---
46: if state.instance_of? patterns::StringState
47: # {{{
# Consume plain content up to the next character matched by the state's
# pattern; if there is none, consume the rest of the input.
48: match = scan_until(state.pattern) || scan_until(/\z/)
49: tokens << [match, :content] unless match.empty?
50: break if eos?
51:
# Heredoc terminator: emit the rest of the line as the closing delimiter.
52: if state.heredoc and self[1] # end of heredoc
53: match = getch.to_s
54: match << scan_until(/$/) unless eos?
55: tokens << [match, :delimiter]
56: tokens << [:close, state.type]
57: state = state.next_state
58: next
59: end
60:
# Dispatch on the single character that stopped the content scan.
61: case match = getch
62:
63: when state.delim
# For paired delimiters, a closing delimiter only ends the literal once the
# nesting depth drops to zero; otherwise it is a nested delimiter.
64: if state.paren
65: state.paren_depth -= 1
66: if state.paren_depth > 0
67: tokens << [match, :nesting_delimiter]
68: next
69: end
70: end
71: tokens << [match, :delimiter]
# Regexp literals may be followed by modifier characters.
72: if state.type == :regexp and not eos?
73: modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/x)
74: tokens << [modifiers, :modifier] unless modifiers.empty?
75: end
76: tokens << [:close, state.type]
77: value_expected = false
78: state = state.next_state
79:
# NOTE(review): the literal '\' below (and in the nested `when` a few lines
# down) is not a terminated Ruby string as shown; presumably the listing lost
# a character of an escaped-backslash literal -- verify against the real file.
80: when '\'
81: if state.interpreted
82: if esc = scan(/ #{patterns::ESCAPE} /x)
83: tokens << [match + esc, :char]
84: else
85: tokens << [match, :error]
86: end
87: else
# Non-interpreted literal: only the delimiter and the backslash itself are
# emitted as :char; a backslash before anything else is plain content.
88: case m = getch
89: when state.delim, '\'
90: tokens << [match + m, :char]
91: when nil
92: tokens << [match, :error]
93: else
94: tokens << [match + m, :content]
95: end
96: end
97:
98: when '#'
99: case peek(1)
100: when '{'
# Start of #{ ... }: save the string context and scan the block as code.
101: inline_block_stack << [state, depth, heredocs]
102: value_expected = true
103: state = :initial
104: depth = 1
105: tokens << [:open, :inline]
106: tokens << [match + getch, :inline_delimiter]
107: when '$', '@'
108: tokens << [match, :escape]
109: last_state = state # scan one token as normal code, then return here
110: state = :initial
111: else
112: raise_inspect 'else-case # reached; #%p not handled' % peek(1), tokens
113: end
114:
# Opening character of a paired delimiter inside the literal: nest deeper.
115: when state.paren
116: state.paren_depth += 1
117: tokens << [match, :nesting_delimiter]
118:
119: when /#{patterns::REGEXP_SYMBOLS}/x
120: tokens << [match, :function]
121:
122: else
123: raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], tokens
124:
125: end
126: next
127: # }}}
# --- Outside string literals: whitespace, comments and ordinary code ---
128: else
129: # {{{
130: if match = scan(/[ \t\f]+/)
131: kind = :space
132: match << scan(/\s*/) unless eos? || heredocs
# NOTE(review): `match.index(\n\)` is not valid Ruby as shown; presumably a
# newline character literal was damaged in this listing -- verify.
133: value_expected = true if match.index(\n\)
134: tokens << [match, kind]
135: next
136:
137: elsif match = scan(/\\?\n/)
138: kind = :space
139: if match == "\n"
140: value_expected = true
141: state = :initial if state == :undef_comma_expected
142: end
# A line break with pending heredocs starts the first queued heredoc body;
# the line break is put back and no :space token is emitted here.
143: if heredocs
144: unscan # heredoc scanning needs \n at start
145: state = heredocs.shift
146: tokens << [:open, state.type]
147: heredocs = nil if heredocs.empty?
148: next
149: else
150: match << scan(/\s*/) unless eos?
151: end
152: tokens << [match, kind]
153: next
154:
# "#!" at the beginning of a line is emitted as :doctype, not :comment.
155: elsif bol? && match = scan(/\#!.*/)
156: tokens << [match, :doctype]
157: next
158:
159: elsif match = scan(/\#.*/) or
160: ( bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/) )
161: kind = :comment
162: tokens << [match, kind]
163: next
164:
# Normal code. The branches below are tried in order; the first pattern that
# matches decides the token kind.
165: elsif state == :initial
166:
167: # IDENTS #
168: if match = scan(unicode ? /#{patterns::METHOD_NAME}/o :
169: /#{patterns::METHOD_NAME}/)
# After a dot the name is never looked up in IDENT_KIND: it is :constant
# when capitalised and not followed by "(", otherwise :ident.
170: if last_token_dot
171: kind = if match[/^[A-Z]/] and not match?(/\(/) then :constant else :ident end
172: else
173: kind = patterns::IDENT_KIND[match]
174: if kind == :ident and match[/^[A-Z]/] and not match[/[!?]$/] and not match?(/\(/)
175: kind = :constant
176: elsif kind == :reserved
# Reserved words may switch to a follow-up state taken from DEF_NEW_STATE.
177: state = patterns::DEF_NEW_STATE[match]
178: value_expected = :set if patterns::KEYWORDS_EXPECTING_VALUE[match]
179: end
180: end
181: value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/)
182:
183: elsif last_token_dot and match = scan(/#{patterns::METHOD_NAME_OPERATOR}|\(/)
184: kind = :ident
185: value_expected = :set if check(/#{patterns::VALUE_FOLLOWS}/)
186:
187: # OPERATORS #
188: elsif not last_token_dot and match = scan(/ \.\.\.? | (?:\.|::)() | [,\(\)\[\]\{\}] | ==?=? /)
189: if match !~ / [.\)\]\}] / or match =~ /\.\.\.?/
190: value_expected = :set
191: end
# Group 1 is the empty capture after "." or "::" in the pattern above, so
# self[1] is non-nil only when one of those two was matched.
192: last_token_dot = :set if self[1]
193: kind = :operator
# Inside #{ ... }: track brace depth to find the brace that closes the block.
194: unless inline_block_stack.empty?
195: case match
196: when '{'
197: depth += 1
198: when '}'
199: depth -= 1
200: if depth == 0 # closing brace of inline block reached
201: state, depth, heredocs = inline_block_stack.pop
202: heredocs = nil if heredocs && heredocs.empty?
203: tokens << [match, :inline_delimiter]
204: kind = :inline
205: match = :close
206: end
207: end
208: end
209:
210: elsif match = scan(/ ['"] /x)
211: tokens << [:open, :string]
212: kind = :delimiter
213: state = patterns::StringState.new :string, match == '"', match # important for streaming
214:
215: elsif match = scan(/#{patterns::INSTANCE_VARIABLE}/)
216: kind = :instance_variable
217:
# "/" starts a regexp only where a value is expected; otherwise it falls
# through to the operator branch further down.
218: elsif value_expected and match = scan(/\//)
219: tokens << [:open, :regexp]
220: kind = :delimiter
221: interpreted = true
222: state = patterns::StringState.new :regexp, interpreted, match
223:
224: # elsif match = scan(/[-+]?#{patterns::NUMERIC}/o)
# A leading sign is part of the number only where a value is expected.
225: elsif match = value_expected ? scan(/[-+]?#{patterns::NUMERIC}/) : scan(/#{patterns::NUMERIC}/)
226: kind = self[1] ? :float : :integer
227:
228: elsif match = scan(/#{patterns::SYMBOL}/)
# NOTE(review): `when '', ""` compares against two empty strings as shown;
# presumably quote-character literals were damaged in this listing (the same
# applies to `delim == ""` below) -- verify against the real file.
229: case delim = match[1]
230: when '', ""
231: tokens << [:open, :symbol]
232: tokens << [':', :symbol]
233: match = delim.chr
234: kind = :delimiter
235: state = patterns::StringState.new :symbol, delim == "", match
236: else
237: kind = :symbol
238: end
239:
240: elsif match = scan(/ [-+!~^]=? | [*|&]{1,2}=? | >>? /)
241: value_expected = :set
242: kind = :operator
243:
# Heredoc opener: open and close the literal around the opener itself, then
# queue a StringState for the body, which starts at the next line break.
244: elsif value_expected and match = scan(/#{patterns::HEREDOC_OPEN}/)
245: indented = self[1] == '-'
246: quote = self[3]
247: delim = self[quote ? 4 : 2]
248: kind = patterns::QUOTE_TO_TYPE[quote]
249: tokens << [:open, kind]
250: tokens << [match, :delimiter]
251: match = :close
# NOTE(review): '\' below is not a terminated string as shown; likely a
# damaged literal in this listing -- verify.
252: heredoc = patterns::StringState.new kind, quote != '\', delim, (indented ? :indented : :linestart )
253: heredocs ||= [] # create heredocs if empty
254: heredocs << heredoc
255:
256: elsif value_expected and match = scan(/#{patterns::FANCY_START_CORRECT}/)
# NOTE(review): `k` is not defined anywhere in the code shown and the fetch
# block declares no parameter, so the error path would presumably raise
# NameError instead of the intended message -- confirm.
257: kind, interpreted = *patterns::FancyStringType.fetch(self[1]) do
258: raise_inspect 'Unknown fancy string: %%%p' % k, tokens
259: end
260: tokens << [:open, kind]
261: state = patterns::StringState.new kind, interpreted, self[2]
262: kind = :delimiter
263:
264: elsif value_expected and match = scan(/#{patterns::CHARACTER}/)
265: kind = :integer
266:
267: elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /)
268: value_expected = :set
269: kind = :operator
270:
# A backtick after a dot is emitted as an operator; otherwise it opens a
# :shell literal.
271: elsif match = scan(/`/)
272: if last_token_dot
273: kind = :operator
274: else
275: tokens << [:open, :shell]
276: kind = :delimiter
277: state = patterns::StringState.new :shell, true, match
278: end
279:
280: elsif match = scan(/#{patterns::GLOBAL_VARIABLE}/)
281: kind = :global_variable
282:
283: elsif match = scan(/#{patterns::CLASS_VARIABLE}/)
284: kind = :class_variable
285:
286: else
# Nothing matched. If unicode mode is off, probe the next character with a
# /u regexp; if it is longer than one unit, switch unicode on and retry this
# position. $DEBUG is switched off during the probe and restored afterwards.
287: if !unicode
288: # check for unicode
289: debug, $DEBUG = $DEBUG, false
290: begin
291: if check(/./u).size > 1
292: # seems like we should try again with unicode
293: unicode = true
294: end
295: rescue
296: # bad unicode char; use getch
297: ensure
298: $DEBUG = debug
299: end
300: next if unicode
301: end
# Otherwise emit one character as an :error token.
302: kind = :error
303: match = getch
304:
305: end
306:
# After a keyword that set :def_expected: optional "self." then a method name.
307: elsif state == :def_expected
308: state = :initial
309: if scan(/self\./)
310: tokens << ['self', :pre_constant]
311: tokens << ['.', :operator]
312: end
313: if match = scan(unicode ? /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o :
314: /(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/)
315: kind = :method
316: else
317: next
318: end
319:
# After a keyword that set :module_expected: "<<" keeps the state, otherwise
# an optionally namespaced identifier is emitted as :class.
320: elsif state == :module_expected
321: if match = scan(/<</)
322: kind = :operator
323: else
324: state = :initial
325: if match = scan(/ (?:#{patterns::IDENT}::)* #{patterns::IDENT} /x)
326: kind = :class
327: else
328: next
329: end
330: end
331:
# :undef_expected and :undef_comma_expected alternate: a method name or
# symbol, then a comma, and so on, until something else is found.
332: elsif state == :undef_expected
333: state = :undef_comma_expected
334: if match = scan(/#{patterns::METHOD_NAME_EX}/)
335: kind = :method
336: elsif match = scan(/#{patterns::SYMBOL}/)
# NOTE(review): same apparently damaged quote literals as in the symbol
# branch of the :initial state -- verify.
337: case delim = match[1]
338: when '', ""
339: tokens << [:open, :symbol]
340: tokens << [':', :symbol]
341: match = delim.chr
342: kind = :delimiter
343: state = patterns::StringState.new :symbol, delim == "", match
344: state.next_state = :undef_comma_expected
345: else
346: kind = :symbol
347: end
348: else
349: state = :initial
350: next
351: end
352:
# :alias_expected matches "name<spaces/tabs>name" in one go and emits three
# tokens directly; if nothing matches, nothing is emitted.
353: elsif state == :alias_expected
354: match = scan(unicode ? /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/o :
355: /(#{patterns::METHOD_NAME_OR_SYMBOL})([ \t]+)(#{patterns::METHOD_NAME_OR_SYMBOL})/)
356:
357: if match
# NOTE(review): `== ::` is not valid Ruby as shown; presumably a colon
# character literal was damaged in this listing -- verify.
358: tokens << [self[1], (self[1][0] == :: ? :symbol : :method)]
359: tokens << [self[2], :space]
360: tokens << [self[3], (self[3][0] == :: ? :symbol : :method)]
361: end
362: state = :initial
363: next
364:
365: elsif state == :undef_comma_expected
366: if match = scan(/,/)
367: kind = :operator
368: state = :undef_expected
369: else
370: state = :initial
371: next
372: end
373:
374: end
375: # }}}
376:
# Normalise the one-shot flags: they stay true for the next token only if
# this iteration assigned :set. Error tokens leave both flags untouched.
377: unless kind == :error
378: value_expected = value_expected == :set
379: last_token_dot = last_token_dot == :set
380: end
381:
382: if $CODERAY_DEBUG and not kind
383: raise_inspect 'Error token %p in line %d' %
384: [[match, kind], line], tokens, state
385: end
386: raise_inspect 'Empty token', tokens unless match
387:
388: tokens << [match, kind]
389:
# Return to the string state saved when #$var / #@var was seen.
390: if last_state
391: state = last_state
392: last_state = nil
393: end
394: end
395: end
396:
# End of input: close every literal and inline block that is still open,
# innermost first.
397: inline_block_stack << [state] if state.is_a? patterns::StringState
398: until inline_block_stack.empty?
399: this_block = inline_block_stack.pop
# Entries saved at "#{" have three elements; the one-element entry pushed
# just above stands for a bare unterminated literal, so it gets no
# [:close, :inline].
400: tokens << [:close, :inline] if this_block.size > 1
401: state = this_block.first
402: tokens << [:close, state.type]
403: end
404:
405: tokens
406: end
Disabled; run with --debug to generate this.
Generated with the Darkfish Rdoc Generator 1.1.6.