diff --git a/Rakefile b/Rakefile index df4cc88..e849857 100644 --- a/Rakefile +++ b/Rakefile @@ -5,10 +5,10 @@ require 'cliver' GEMSPEC = Gem::Specification.load('oga.gemspec') -LEXER_INPUT = 'lib/oga/lexer.rl' -LEXER_OUTPUT = 'lib/oga/lexer.rb' +LEXER_INPUT = 'lib/oga/xml/lexer.rl' +LEXER_OUTPUT = 'lib/oga/xml/lexer.rb' -HTML_PARSER = 'lib/oga/parser.rb' +HTML_PARSER = 'lib/oga/xml/parser.rb' GENERATED_FILES = ['coverage', 'yardoc', LEXER_OUTPUT, HTML_PARSER] diff --git a/benchmark/lexer/bench_cdata.rb b/benchmark/lexer/bench_cdata.rb index 7cf11fd..fd84b2c 100644 --- a/benchmark/lexer/bench_cdata.rb +++ b/benchmark/lexer/bench_cdata.rb @@ -5,7 +5,7 @@ string = 'Hello, how are you doing today?' small = "" medium = "" large = "" -lexer = Oga::Lexer.new +lexer = Oga::XML::Lexer.new Benchmark.ips do |bench| bench.report 'CDATA with a small body' do diff --git a/benchmark/lexer/bench_element.rb b/benchmark/lexer/bench_element.rb index ed3c760..93a4674 100644 --- a/benchmark/lexer/bench_element.rb +++ b/benchmark/lexer/bench_element.rb @@ -4,7 +4,7 @@ require 'benchmark/ips' simple = '

Hello world

' attributes = '

Hello world

' nested = '

Helloworld

' -lexer = Oga::Lexer.new +lexer = Oga::XML::Lexer.new Benchmark.ips do |bench| bench.report 'text only' do diff --git a/benchmark/lexer/bench_html.rb b/benchmark/lexer/bench_html.rb index d75bd3a..213ea80 100644 --- a/benchmark/lexer/bench_html.rb +++ b/benchmark/lexer/bench_html.rb @@ -2,7 +2,7 @@ require_relative '../../lib/oga' require 'benchmark/ips' html = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__)) -lexer = Oga::Lexer.new(:html => true) +lexer = Oga::XML::Lexer.new(:html => true) Benchmark.ips do |bench| bench.report 'lex HTML' do diff --git a/benchmark/lexer/bench_html_time.rb b/benchmark/lexer/bench_html_time.rb index d4d08f1..131b203 100644 --- a/benchmark/lexer/bench_html_time.rb +++ b/benchmark/lexer/bench_html_time.rb @@ -2,7 +2,7 @@ require_relative '../../lib/oga' require 'benchmark' html = File.read(File.expand_path('../../fixtures/hrs.html', __FILE__)) -lexer = Oga::Lexer.new(:html => true) +lexer = Oga::XML::Lexer.new(:html => true) Benchmark.bmbm(20) do |bench| bench.report 'lex HTML' do diff --git a/lib/oga.rb b/lib/oga.rb index 4952316..f065fbc 100644 --- a/lib/oga.rb +++ b/lib/oga.rb @@ -1,5 +1,5 @@ require 'ast' require_relative 'oga/ast/node' -require_relative 'oga/lexer' -require_relative 'oga/parser' +require_relative 'oga/xml/lexer' +require_relative 'oga/xml/parser' diff --git a/lib/oga/lexer.rl b/lib/oga/lexer.rl deleted file mode 100644 index 3f00c07..0000000 --- a/lib/oga/lexer.rl +++ /dev/null @@ -1,508 +0,0 @@ -%%machine lexer; # % - -module Oga - ## - # Low level lexer that supports both XML and HTML (using an extra option). To - # lex HTML input set the `:html` option to `true` when creating an instance - # of the lexer: - # - # lexer = Oga::Lexer.new(:html => true) - # - # @!attribute [r] html - # @return [TrueClass|FalseClass] - # - class Lexer - %% write data; # % - - attr_reader :html - - ## - # Names of the HTML void elements that should be handled when HTML lexing - # is enabled. - # - # @return [Array] - # - HTML_VOID_ELEMENTS = [ - 'area', - 'base', - 'br', - 'col', - 'command', - 'embed', - 'hr', - 'img', - 'input', - 'keygen', - 'link', - 'meta', - 'param', - 'source', - 'track', - 'wbr' - ] - - # Lazy way of forwarding instance method calls used internally by Ragel to - # their corresponding class methods. - private_methods.grep(/^_lexer_/).each do |name| - define_method(name) do - return self.class.send(name) - end - - private(name) - end - - ## - # @param [Hash] options - # - # @option options [Symbol] :html When set to `true` the lexer will treat - # the input as HTML instead of SGML/XML. This makes it possible to lex - # HTML void elements such as ``. - # - def initialize(options = {}) - options.each do |key, value| - instance_variable_set("@#{key}", value) if respond_to?(key) - end - - reset - end - - ## - # Resets the internal state of the lexer. Typically you don't need to call - # this method yourself as its called by #lex after lexing a given String. - # - def reset - @line = 1 - @data = nil - @ts = nil - @te = nil - @tokens = [] - @stack = [] - @top = 0 - @elements = [] - - @buffer_start_position = nil - end - - ## - # Lexes the supplied String and returns an Array of tokens. Each token is - # an Array in the following format: - # - # [TYPE, VALUE] - # - # The type is a symbol, the value is either nil or a String. - # - # @param [String] data The string to lex. - # @return [Array] - # - def lex(data) - @data = data.unpack('U*') - lexer_start = self.class.lexer_start - eof = data.length - - %% write init; - %% write exec; - - tokens = @tokens - - reset - - return tokens - end - - ## - # @return [TrueClass|FalseClass] - # - def html? - return !!html - end - - private - - ## - # @param [Fixnum] amount The amount of lines to advance. - # - def advance_line(amount = 1) - @line += amount - end - - ## - # Emits a token who's value is based on the supplied start/stop position. - # - # @param [Symbol] type The token type. - # @param [Fixnum] start - # @param [Fixnum] stop - # - # @see #text - # @see #add_token - # - def t(type, start = @ts, stop = @te) - value = text(start, stop) - - add_token(type, value) - end - - ## - # Returns the text of the current buffer based on the supplied start and - # stop position. - # - # By default `@ts` and `@te` are used as the start/stop position. - # - # @param [Fixnum] start - # @param [Fixnum] stop - # @return [String] - # - def text(start = @ts, stop = @te) - return @data[start...stop].pack('U*') - end - - ## - # Adds a token with the given type and value to the list. - # - # @param [Symbol] type The token type. - # @param [String] value The token value. - # - def add_token(type, value = nil) - token = [type, value, @line] - - @tokens << token - end - - ## - # Enables buffering starting at the given position. - # - # @param [Fixnum] position The start position of the buffer, set to `@te` - # by default. - # - def start_buffer(position = @te) - @buffer_start_position = position - end - - ## - # Returns `true` if we're currently buffering. - # - # @return [TrueClass|FalseClass] - # - def buffering? - return !!@buffer_start_position - end - - ## - # Emits the current buffer if we have any. The current line number is - # advanced based on the amount of newlines in the buffer. - # - # @param [Fixnum] position The end position of the buffer, set to `@ts` by - # default. - # - # @param [Symbol] type The type of node to emit. - # - def emit_buffer(position = @ts, type = :T_TEXT) - return unless @buffer_start_position - - content = text(@buffer_start_position, position) - - unless content.empty? - add_token(type, content) - - lines = content.count("\n") - - advance_line(lines) if lines > 0 - end - - @buffer_start_position = nil - end - - ## - # Returns the name of the element we're currently in. - # - # @return [String] - # - def current_element - return @elements.last - end - - %%{ - # Use instance variables for `ts` and friends. - access @; - getkey (@data[p] || 0); - - newline = '\n' | '\r\n'; - whitespace = [ \t]; - - # Strings - # - # Strings in HTML can either be single or double quoted. If a string - # starts with one of these quotes it must be closed with the same type of - # quote. - dquote = '"'; - squote = "'"; - - action start_string_dquote { - start_buffer - - fcall string_dquote; - } - - action start_string_squote { - start_buffer - - fcall string_squote; - } - - # Machine for processing double quoted strings. - string_dquote := |* - dquote => { - emit_buffer(@ts, :T_STRING) - fret; - }; - - any; - *|; - - # Machine for processing single quoted strings. - string_squote := |* - squote => { - emit_buffer(@ts, :T_STRING) - fret; - }; - - any; - *|; - - # DOCTYPES - # - # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax - # - # These rules support the 3 flavours of doctypes: - # - # 1. Normal doctypes, as introduced in the HTML5 specification. - # 2. Deprecated doctypes, the more verbose ones used prior to HTML5. - # 3. Legacy doctypes - # - doctype_start = ' { t(:T_DOCTYPE_TYPE) }; - - # Lex the public/system IDs as regular strings. - dquote => start_string_dquote; - squote => start_string_squote; - - # Whitespace inside doctypes is ignored since there's no point in - # including it. - whitespace; - - '>' => { - add_token(:T_DOCTYPE_END) - fret; - }; - *|; - - # CDATA - # - # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections - # - # CDATA tags are broken up into 3 parts: the start, the content and the - # end tag. - # - # In HTML CDATA tags have no meaning/are not supported. Oga does - # support them but treats their contents as plain text. - # - cdata_start = ''; - - action start_cdata { - emit_buffer - add_token(:T_CDATA_START) - - start_buffer - - fcall cdata; - } - - # Machine that for processing the contents of CDATA tags. Everything - # inside a CDATA tag is treated as plain text. - cdata := |* - cdata_end => { - emit_buffer - add_token(:T_CDATA_END) - - fret; - }; - - any; - *|; - - # Comments - # - # http://www.w3.org/TR/html-markup/syntax.html#comments - # - # Comments are lexed into 3 parts: the start tag, the content and the end - # tag. - # - # Unlike the W3 specification these rules *do* allow character sequences - # such as `--` and `->`. Putting extra checks in for these sequences - # would actually make the rules/actions more complex. - # - comment_start = ''; - - action start_comment { - emit_buffer - add_token(:T_COMMENT_START) - - start_buffer - - fcall comment; - } - - # Machine used for processing the contents of a comment. Everything - # inside a comment is treated as plain text (similar to CDATA tags). - comment := |* - comment_end => { - emit_buffer - add_token(:T_COMMENT_END) - - fret; - }; - - any; - *|; - - # XML declaration tags - # - # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd - # - xml_decl_start = ''; - - action start_xml_decl { - emit_buffer - add_token(:T_XML_DECL_START) - - start_buffer - - fcall xml_decl; - } - - # Machine that processes the contents of an XML declaration tag. - xml_decl := |* - xml_decl_end => { - emit_buffer - add_token(:T_XML_DECL_END) - - fret; - }; - - any; - *|; - - # Elements - # - # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements - # - - # Action that creates the tokens for the opening tag, name and namespace - # (if any). Remaining work is delegated to a dedicated machine. - action start_element { - emit_buffer - add_token(:T_ELEM_START) - - # Add the element name. If the name includes a namespace we'll break - # the name up into two separate tokens. - name = text(@ts + 1) - - if name.include?(':') - ns, name = name.split(':') - - add_token(:T_ELEM_NS, ns) - end - - @elements << name - - add_token(:T_ELEM_NAME, name) - - fcall element_head; - } - - element_name = [a-zA-Z0-9\-_:]+; - element_start = '<' element_name; - - # Machine used for processing the characters inside a element head. An - # element head is everything between ``. - # - # For example, in `

` the element head is ` foo="bar"`. - # - element_head := |* - whitespace | '='; - - newline => { advance_line }; - - # Attribute names. - element_name => { t(:T_ATTR) }; - - # Attribute values. - dquote => start_string_dquote; - squote => start_string_squote; - - # The closing character of the open tag. - ('>' | '/') => { - fhold; - fret; - }; - *|; - - main := |* - element_start => start_element; - doctype_start => start_doctype; - cdata_start => start_cdata; - comment_start => start_comment; - xml_decl_start => start_xml_decl; - - # Enter the body of the tag. If HTML mode is enabled and the current - # element is a void element we'll close it and bail out. - '>' => { - if html? and HTML_VOID_ELEMENTS.include?(current_element) - add_token(:T_ELEM_END, nil) - @elements.pop - end - }; - - # Regular closing tags. - '' => { - emit_buffer - add_token(:T_ELEM_END, nil) - - @elements.pop - }; - - # Self closing elements that are not handled by the HTML mode. - '/>' => { - add_token(:T_ELEM_END, nil) - - @elements.pop - }; - - # Note that this rule should be declared at the very bottom as it will - # otherwise take precedence over the other rules. - any => { - # First character, start buffering (unless we already are buffering). - start_buffer(@ts) unless buffering? - - # EOF, emit the text buffer. - if @te == eof - emit_buffer(@te) - end - }; - *|; - }%% - end # Lexer -end # Oga diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb new file mode 100644 index 0000000..9608aec --- /dev/null +++ b/lib/oga/xml/lexer.rb @@ -0,0 +1,1108 @@ + +# line 1 "lib/oga/xml/lexer.rl" + +# line 3 "lib/oga/xml/lexer.rl" +module Oga + module XML + ## + # Low level lexer that supports both XML and HTML (using an extra option). To + # lex HTML input set the `:html` option to `true` when creating an instance + # of the lexer: + # + # lexer = Oga::Lexer.new(:html => true) + # + # @!attribute [r] html + # @return [TrueClass|FalseClass] + # + class Lexer + +# line 20 "lib/oga/xml/lexer.rb" +class << self + attr_accessor :_lexer_trans_keys + private :_lexer_trans_keys, :_lexer_trans_keys= +end +self._lexer_trans_keys = [ + 0, 0, 45, 100, 45, 45, + 79, 111, 67, 99, 84, + 116, 89, 121, 80, 112, + 69, 101, 9, 32, 9, 104, + 84, 116, 77, 109, 76, + 108, 67, 67, 68, 68, + 65, 65, 84, 84, 65, 65, + 91, 91, 45, 122, 45, + 122, 120, 120, 109, 109, + 108, 108, 85, 85, 66, 66, + 76, 76, 73, 73, 67, + 67, 89, 89, 83, 83, + 84, 84, 69, 69, 77, 77, + 62, 62, 62, 62, 10, + 10, 47, 62, 62, 62, + 33, 122, 45, 122, 34, 34, + 39, 39, 9, 83, 93, + 93, 93, 93, 45, 45, + 45, 45, 63, 63, 62, 62, + 9, 122, 45, 122, 0 +] + +class << self + attr_accessor :_lexer_key_spans + private :_lexer_key_spans, :_lexer_key_spans= +end +self._lexer_key_spans = [ + 0, 56, 1, 33, 33, 33, 33, 33, + 33, 24, 96, 33, 33, 33, 1, 1, + 1, 1, 1, 1, 78, 78, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 16, 1, + 90, 78, 1, 1, 75, 1, 1, 1, + 1, 1, 1, 114, 78 +] + +class << self + attr_accessor :_lexer_index_offsets + private :_lexer_index_offsets, :_lexer_index_offsets= +end +self._lexer_index_offsets = [ + 0, 0, 57, 59, 93, 127, 161, 195, + 229, 263, 288, 385, 419, 453, 487, 489, + 491, 493, 495, 497, 499, 578, 657, 659, + 661, 663, 665, 667, 669, 671, 673, 675, + 677, 679, 681, 683, 685, 687, 689, 706, + 708, 799, 878, 880, 882, 958, 960, 962, + 964, 966, 968, 970, 1085 +] + +class << self + attr_accessor :_lexer_indicies + private :_lexer_indicies, :_lexer_indicies= +end +self._lexer_indicies = [ + 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 3, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 0, 4, 0, 5, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 0, 6, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 0, 7, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 0, 8, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 0, 9, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 9, 0, 10, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 10, 0, 11, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 11, 0, + 11, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 11, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 12, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 12, + 0, 13, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 13, 0, 14, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 14, 0, 15, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 15, 0, 16, + 0, 17, 0, 18, 0, 19, 0, 20, + 0, 21, 0, 22, 0, 0, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 0, 0, 0, 0, 0, 0, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 0, 0, 0, 0, 22, 0, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 0, 22, 0, 0, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 0, 0, 0, 23, 0, 0, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 0, 0, 0, 0, 22, 0, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 0, 24, 0, 25, 0, 26, 0, 27, + 28, 29, 28, 30, 28, 31, 28, 32, + 28, 33, 28, 34, 28, 35, 28, 36, + 28, 32, 28, 38, 37, 40, 39, 41, + 28, 43, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 44, 42, + 45, 42, 47, 46, 48, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 49, 46, 50, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 46, 46, + 46, 46, 51, 46, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 46, 46, + 46, 46, 49, 46, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 46, 49, + 52, 52, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 52, 52, 52, + 52, 52, 52, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 52, 52, 52, + 52, 49, 52, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 52, 54, 53, + 56, 55, 57, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 57, 28, 58, 28, 28, 28, 28, + 59, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 60, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 61, 28, 28, 62, 28, 64, 63, + 66, 65, 68, 67, 70, 69, 72, 71, + 74, 73, 75, 41, 28, 28, 76, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 75, 28, 77, 28, 28, 28, 28, + 78, 28, 28, 28, 28, 28, 79, 28, + 80, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 28, 28, 75, 80, + 28, 28, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 28, 28, 28, 28, + 79, 28, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 28, 79, 81, 81, + 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 81, 81, 81, 81, 81, + 81, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 81, 81, 81, 81, 79, + 81, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 79, 79, 79, 79, 79, + 79, 79, 79, 81, 0 +] + +class << self + attr_accessor :_lexer_trans_targs + private :_lexer_trans_targs, :_lexer_trans_targs= +end +self._lexer_trans_targs = [ + 38, 2, 3, 14, 38, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 38, + 15, 16, 17, 18, 19, 38, 21, 38, + 23, 24, 38, 26, 0, 27, 28, 29, + 44, 31, 32, 33, 34, 45, 45, 47, + 47, 51, 38, 39, 40, 38, 38, 38, + 1, 41, 20, 22, 38, 42, 42, 43, + 43, 44, 44, 44, 44, 25, 30, 45, + 46, 45, 35, 47, 48, 47, 36, 49, + 50, 49, 49, 51, 37, 51, 51, 52, + 51, 51 +] + +class << self + attr_accessor :_lexer_trans_actions + private :_lexer_trans_actions, :_lexer_trans_actions= +end +self._lexer_trans_actions = [ + 1, 0, 0, 0, 2, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, + 0, 0, 0, 0, 0, 4, 0, 5, + 0, 0, 6, 0, 0, 0, 0, 0, + 7, 0, 0, 0, 0, 8, 9, 10, + 11, 12, 15, 0, 16, 17, 18, 19, + 0, 0, 0, 0, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 0, 0, 29, + 16, 30, 0, 31, 16, 32, 0, 33, + 0, 34, 35, 36, 0, 37, 38, 0, + 39, 40 +] + +class << self + attr_accessor :_lexer_to_state_actions + private :_lexer_to_state_actions, :_lexer_to_state_actions= +end +self._lexer_to_state_actions = [ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 13, 0, + 0, 0, 13, 13, 13, 13, 0, 13, + 0, 13, 0, 13, 0 +] + +class << self + attr_accessor :_lexer_from_state_actions + private :_lexer_from_state_actions, :_lexer_from_state_actions= +end +self._lexer_from_state_actions = [ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 14, 0, + 0, 0, 14, 14, 14, 14, 0, 14, + 0, 14, 0, 14, 0 +] + +class << self + attr_accessor :_lexer_eof_trans + private :_lexer_eof_trans, :_lexer_eof_trans= +end +self._lexer_eof_trans = [ + 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 38, 40, 0, 0, 47, + 47, 53, 0, 0, 0, 0, 66, 0, + 70, 0, 74, 0, 82 +] + +class << self + attr_accessor :lexer_start +end +self.lexer_start = 38; +class << self + attr_accessor :lexer_first_final +end +self.lexer_first_final = 38; +class << self + attr_accessor :lexer_error +end +self.lexer_error = 0; + +class << self + attr_accessor :lexer_en_string_dquote +end +self.lexer_en_string_dquote = 42; +class << self + attr_accessor :lexer_en_string_squote +end +self.lexer_en_string_squote = 43; +class << self + attr_accessor :lexer_en_doctype +end +self.lexer_en_doctype = 44; +class << self + attr_accessor :lexer_en_cdata +end +self.lexer_en_cdata = 45; +class << self + attr_accessor :lexer_en_comment +end +self.lexer_en_comment = 47; +class << self + attr_accessor :lexer_en_xml_decl +end +self.lexer_en_xml_decl = 49; +class << self + attr_accessor :lexer_en_element_head +end +self.lexer_en_element_head = 51; +class << self + attr_accessor :lexer_en_main +end +self.lexer_en_main = 38; + + +# line 18 "lib/oga/xml/lexer.rl" + attr_reader :html + + ## + # Names of the HTML void elements that should be handled when HTML lexing + # is enabled. + # + # @return [Array] + # + HTML_VOID_ELEMENTS = [ + 'area', + 'base', + 'br', + 'col', + 'command', + 'embed', + 'hr', + 'img', + 'input', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr' + ] + + # Lazy way of forwarding instance method calls used internally by Ragel to + # their corresponding class methods. + private_methods.grep(/^_lexer_/).each do |name| + define_method(name) do + return self.class.send(name) + end + + private(name) + end + + ## + # @param [Hash] options + # + # @option options [Symbol] :html When set to `true` the lexer will treat + # the input as HTML instead of SGML/XML. This makes it possible to lex + # HTML void elements such as ``. + # + def initialize(options = {}) + options.each do |key, value| + instance_variable_set("@#{key}", value) if respond_to?(key) + end + + reset + end + + ## + # Resets the internal state of the lexer. Typically you don't need to call + # this method yourself as its called by #lex after lexing a given String. + # + def reset + @line = 1 + @data = nil + @ts = nil + @te = nil + @tokens = [] + @stack = [] + @top = 0 + @elements = [] + + @buffer_start_position = nil + end + + ## + # Lexes the supplied String and returns an Array of tokens. Each token is + # an Array in the following format: + # + # [TYPE, VALUE] + # + # The type is a symbol, the value is either nil or a String. + # + # @param [String] data The string to lex. + # @return [Array] + # + def lex(data) + @data = data.unpack('U*') + lexer_start = self.class.lexer_start + eof = data.length + + +# line 441 "lib/oga/xml/lexer.rb" +begin + p ||= 0 + pe ||= @data.length + @cs = lexer_start + @top = 0 + @ts = nil + @te = nil + @act = 0 +end + +# line 104 "lib/oga/xml/lexer.rl" + +# line 454 "lib/oga/xml/lexer.rb" +begin + testEof = false + _slen, _trans, _keys, _inds, _acts, _nacts = nil + _goto_level = 0 + _resume = 10 + _eof_trans = 15 + _again = 20 + _test_eof = 30 + _out = 40 + while true + if _goto_level <= 0 + if p == pe + _goto_level = _test_eof + next + end + if @cs == 0 + _goto_level = _out + next + end + end + if _goto_level <= _resume + case _lexer_from_state_actions[ @cs] + when 14 then +# line 1 "NONE" + begin + @ts = p + end +# line 482 "lib/oga/xml/lexer.rb" + end + _keys = @cs << 1 + _inds = _lexer_index_offsets[ @cs] + _slen = _lexer_key_spans[ @cs] + _trans = if ( _slen > 0 && + _lexer_trans_keys[_keys] <= ( (@data[p] || 0)) && + ( (@data[p] || 0)) <= _lexer_trans_keys[_keys + 1] + ) then + _lexer_indicies[ _inds + ( (@data[p] || 0)) - _lexer_trans_keys[_keys] ] + else + _lexer_indicies[ _inds + _slen ] + end + end + if _goto_level <= _eof_trans + @cs = _lexer_trans_targs[_trans] + if _lexer_trans_actions[_trans] != 0 + case _lexer_trans_actions[_trans] + when 16 then +# line 1 "NONE" + begin + @te = p+1 + end + when 22 then +# line 254 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + emit_buffer(@ts, :T_STRING) + begin + @top -= 1 + @cs = @stack[ @top] + _goto_level = _again + next + end + + end + end + when 21 then +# line 259 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + end + when 24 then +# line 264 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + emit_buffer(@ts, :T_STRING) + begin + @top -= 1 + @cs = @stack[ @top] + _goto_level = _again + next + end + + end + end + when 23 then +# line 269 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + end + when 7 then +# line 293 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin t(:T_DOCTYPE_TYPE) end + end + when 26 then +# line 240 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + start_buffer + + begin + @stack[ @top] = @cs + @top+= 1 + @cs = 42 + _goto_level = _again + next + end + + end + end + when 27 then +# line 246 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + start_buffer + + begin + @stack[ @top] = @cs + @top+= 1 + @cs = 43 + _goto_level = _again + next + end + + end + end + when 25 then +# line 301 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + end + when 28 then +# line 303 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + add_token(:T_DOCTYPE_END) + begin + @top -= 1 + @cs = @stack[ @top] + _goto_level = _again + next + end + + end + end + when 9 then +# line 334 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + emit_buffer + add_token(:T_CDATA_END) + + begin + @top -= 1 + @cs = @stack[ @top] + _goto_level = _again + next + end + + end + end + when 29 then +# line 341 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + end + when 30 then +# line 341 "lib/oga/xml/lexer.rl" + begin + @te = p +p = p - 1; end + when 8 then +# line 341 "lib/oga/xml/lexer.rl" + begin + begin p = (( @te))-1; end + end + when 11 then +# line 370 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + emit_buffer + add_token(:T_COMMENT_END) + + begin + @top -= 1 + @cs = @stack[ @top] + _goto_level = _again + next + end + + end + end + when 31 then +# line 377 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + end + when 32 then +# line 377 "lib/oga/xml/lexer.rl" + begin + @te = p +p = p - 1; end + when 10 then +# line 377 "lib/oga/xml/lexer.rl" + begin + begin p = (( @te))-1; end + end + when 35 then +# line 398 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + emit_buffer + add_token(:T_XML_DECL_END) + + begin + @top -= 1 + @cs = @stack[ @top] + _goto_level = _again + next + end + + end + end + when 33 then +# line 405 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + end + when 34 then +# line 405 "lib/oga/xml/lexer.rl" + begin + @te = p +p = p - 1; end + when 36 then +# line 446 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + end + when 12 then +# line 448 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin advance_line end + end + when 37 then +# line 240 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + start_buffer + + begin + @stack[ @top] = @cs + @top+= 1 + @cs = 42 + _goto_level = _again + next + end + + end + end + when 38 then +# line 246 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + start_buffer + + begin + @stack[ @top] = @cs + @top+= 1 + @cs = 43 + _goto_level = _again + next + end + + end + end + when 39 then +# line 458 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + p = p - 1; + begin + @top -= 1 + @cs = @stack[ @top] + _goto_level = _again + next + end + + end + end + when 40 then +# line 451 "lib/oga/xml/lexer.rl" + begin + @te = p +p = p - 1; begin t(:T_ATTR) end + end + when 3 then +# line 284 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + emit_buffer + add_token(:T_DOCTYPE_START) + begin + @stack[ @top] = @cs + @top+= 1 + @cs = 44 + _goto_level = _again + next + end + + end + end + when 4 then +# line 322 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + emit_buffer + add_token(:T_CDATA_START) + + start_buffer + + begin + @stack[ @top] = @cs + @top+= 1 + @cs = 45 + _goto_level = _again + next + end + + end + end + when 2 then +# line 358 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + emit_buffer + add_token(:T_COMMENT_START) + + start_buffer + + begin + @stack[ @top] = @cs + @top+= 1 + @cs = 47 + _goto_level = _again + next + end + + end + end + when 6 then +# line 387 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + emit_buffer + add_token(:T_XML_DECL_START) + + start_buffer + + begin + @stack[ @top] = @cs + @top+= 1 + @cs = 49 + _goto_level = _again + next + end + + end + end + when 17 then +# line 473 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + if html? and HTML_VOID_ELEMENTS.include?(current_element) + add_token(:T_ELEM_END, nil) + @elements.pop + end + end + end + when 5 then +# line 481 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + emit_buffer + add_token(:T_ELEM_END, nil) + + @elements.pop + end + end + when 19 then +# line 489 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + add_token(:T_ELEM_END, nil) + + @elements.pop + end + end + when 15 then +# line 497 "lib/oga/xml/lexer.rl" + begin + @te = p+1 + begin + # First character, start buffering (unless we already are buffering). + start_buffer(@ts) unless buffering? + + # EOF, emit the text buffer. + if @te == eof + emit_buffer(@te) + end + end + end + when 20 then +# line 415 "lib/oga/xml/lexer.rl" + begin + @te = p +p = p - 1; begin + emit_buffer + add_token(:T_ELEM_START) + + # Add the element name. If the name includes a namespace we'll break + # the name up into two separate tokens. + name = text(@ts + 1) + + if name.include?(':') + ns, name = name.split(':') + + add_token(:T_ELEM_NS, ns) + end + + @elements << name + + add_token(:T_ELEM_NAME, name) + + begin + @stack[ @top] = @cs + @top+= 1 + @cs = 51 + _goto_level = _again + next + end + + end + end + when 18 then +# line 497 "lib/oga/xml/lexer.rl" + begin + @te = p +p = p - 1; begin + # First character, start buffering (unless we already are buffering). + start_buffer(@ts) unless buffering? + + # EOF, emit the text buffer. + if @te == eof + emit_buffer(@te) + end + end + end + when 1 then +# line 497 "lib/oga/xml/lexer.rl" + begin + begin p = (( @te))-1; end + begin + # First character, start buffering (unless we already are buffering). + start_buffer(@ts) unless buffering? + + # EOF, emit the text buffer. + if @te == eof + emit_buffer(@te) + end + end + end +# line 945 "lib/oga/xml/lexer.rb" + end + end + end + if _goto_level <= _again + case _lexer_to_state_actions[ @cs] + when 13 then +# line 1 "NONE" + begin + @ts = nil; end +# line 955 "lib/oga/xml/lexer.rb" + end + + if @cs == 0 + _goto_level = _out + next + end + p += 1 + if p != pe + _goto_level = _resume + next + end + end + if _goto_level <= _test_eof + if p == eof + if _lexer_eof_trans[ @cs] > 0 + _trans = _lexer_eof_trans[ @cs] - 1; + _goto_level = _eof_trans + next; + end + end + + end + if _goto_level <= _out + break + end +end + end + +# line 105 "lib/oga/xml/lexer.rl" + + tokens = @tokens + + reset + + return tokens + end + + ## + # @return [TrueClass|FalseClass] + # + def html? + return !!html + end + + private + + ## + # @param [Fixnum] amount The amount of lines to advance. + # + def advance_line(amount = 1) + @line += amount + end + + ## + # Emits a token who's value is based on the supplied start/stop position. + # + # @param [Symbol] type The token type. + # @param [Fixnum] start + # @param [Fixnum] stop + # + # @see #text + # @see #add_token + # + def t(type, start = @ts, stop = @te) + value = text(start, stop) + + add_token(type, value) + end + + ## + # Returns the text of the current buffer based on the supplied start and + # stop position. + # + # By default `@ts` and `@te` are used as the start/stop position. + # + # @param [Fixnum] start + # @param [Fixnum] stop + # @return [String] + # + def text(start = @ts, stop = @te) + return @data[start...stop].pack('U*') + end + + ## + # Adds a token with the given type and value to the list. + # + # @param [Symbol] type The token type. + # @param [String] value The token value. + # + def add_token(type, value = nil) + token = [type, value, @line] + + @tokens << token + end + + ## + # Enables buffering starting at the given position. + # + # @param [Fixnum] position The start position of the buffer, set to `@te` + # by default. + # + def start_buffer(position = @te) + @buffer_start_position = position + end + + ## + # Returns `true` if we're currently buffering. + # + # @return [TrueClass|FalseClass] + # + def buffering? + return !!@buffer_start_position + end + + ## + # Emits the current buffer if we have any. The current line number is + # advanced based on the amount of newlines in the buffer. + # + # @param [Fixnum] position The end position of the buffer, set to `@ts` by + # default. + # + # @param [Symbol] type The type of node to emit. + # + def emit_buffer(position = @ts, type = :T_TEXT) + return unless @buffer_start_position + + content = text(@buffer_start_position, position) + + unless content.empty? + add_token(type, content) + + lines = content.count("\n") + + advance_line(lines) if lines > 0 + end + + @buffer_start_position = nil + end + + ## + # Returns the name of the element we're currently in. + # + # @return [String] + # + def current_element + return @elements.last + end + + +# line 507 "lib/oga/xml/lexer.rl" + + end # Lexer + end # XML +end # Oga diff --git a/lib/oga/xml/lexer.rl b/lib/oga/xml/lexer.rl new file mode 100644 index 0000000..2da2955 --- /dev/null +++ b/lib/oga/xml/lexer.rl @@ -0,0 +1,510 @@ +%%machine lexer; # % + +module Oga + module XML + ## + # Low level lexer that supports both XML and HTML (using an extra option). To + # lex HTML input set the `:html` option to `true` when creating an instance + # of the lexer: + # + # lexer = Oga::Lexer.new(:html => true) + # + # @!attribute [r] html + # @return [TrueClass|FalseClass] + # + class Lexer + %% write data; # % + + attr_reader :html + + ## + # Names of the HTML void elements that should be handled when HTML lexing + # is enabled. + # + # @return [Array] + # + HTML_VOID_ELEMENTS = [ + 'area', + 'base', + 'br', + 'col', + 'command', + 'embed', + 'hr', + 'img', + 'input', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr' + ] + + # Lazy way of forwarding instance method calls used internally by Ragel to + # their corresponding class methods. + private_methods.grep(/^_lexer_/).each do |name| + define_method(name) do + return self.class.send(name) + end + + private(name) + end + + ## + # @param [Hash] options + # + # @option options [Symbol] :html When set to `true` the lexer will treat + # the input as HTML instead of SGML/XML. This makes it possible to lex + # HTML void elements such as ``. + # + def initialize(options = {}) + options.each do |key, value| + instance_variable_set("@#{key}", value) if respond_to?(key) + end + + reset + end + + ## + # Resets the internal state of the lexer. Typically you don't need to call + # this method yourself as its called by #lex after lexing a given String. + # + def reset + @line = 1 + @data = nil + @ts = nil + @te = nil + @tokens = [] + @stack = [] + @top = 0 + @elements = [] + + @buffer_start_position = nil + end + + ## + # Lexes the supplied String and returns an Array of tokens. Each token is + # an Array in the following format: + # + # [TYPE, VALUE] + # + # The type is a symbol, the value is either nil or a String. + # + # @param [String] data The string to lex. + # @return [Array] + # + def lex(data) + @data = data.unpack('U*') + lexer_start = self.class.lexer_start + eof = data.length + + %% write init; + %% write exec; + + tokens = @tokens + + reset + + return tokens + end + + ## + # @return [TrueClass|FalseClass] + # + def html? + return !!html + end + + private + + ## + # @param [Fixnum] amount The amount of lines to advance. + # + def advance_line(amount = 1) + @line += amount + end + + ## + # Emits a token who's value is based on the supplied start/stop position. + # + # @param [Symbol] type The token type. + # @param [Fixnum] start + # @param [Fixnum] stop + # + # @see #text + # @see #add_token + # + def t(type, start = @ts, stop = @te) + value = text(start, stop) + + add_token(type, value) + end + + ## + # Returns the text of the current buffer based on the supplied start and + # stop position. + # + # By default `@ts` and `@te` are used as the start/stop position. + # + # @param [Fixnum] start + # @param [Fixnum] stop + # @return [String] + # + def text(start = @ts, stop = @te) + return @data[start...stop].pack('U*') + end + + ## + # Adds a token with the given type and value to the list. + # + # @param [Symbol] type The token type. + # @param [String] value The token value. + # + def add_token(type, value = nil) + token = [type, value, @line] + + @tokens << token + end + + ## + # Enables buffering starting at the given position. + # + # @param [Fixnum] position The start position of the buffer, set to `@te` + # by default. + # + def start_buffer(position = @te) + @buffer_start_position = position + end + + ## + # Returns `true` if we're currently buffering. + # + # @return [TrueClass|FalseClass] + # + def buffering? + return !!@buffer_start_position + end + + ## + # Emits the current buffer if we have any. The current line number is + # advanced based on the amount of newlines in the buffer. + # + # @param [Fixnum] position The end position of the buffer, set to `@ts` by + # default. + # + # @param [Symbol] type The type of node to emit. + # + def emit_buffer(position = @ts, type = :T_TEXT) + return unless @buffer_start_position + + content = text(@buffer_start_position, position) + + unless content.empty? + add_token(type, content) + + lines = content.count("\n") + + advance_line(lines) if lines > 0 + end + + @buffer_start_position = nil + end + + ## + # Returns the name of the element we're currently in. + # + # @return [String] + # + def current_element + return @elements.last + end + + %%{ + # Use instance variables for `ts` and friends. + access @; + getkey (@data[p] || 0); + + newline = '\n' | '\r\n'; + whitespace = [ \t]; + + # Strings + # + # Strings in HTML can either be single or double quoted. If a string + # starts with one of these quotes it must be closed with the same type of + # quote. + dquote = '"'; + squote = "'"; + + action start_string_dquote { + start_buffer + + fcall string_dquote; + } + + action start_string_squote { + start_buffer + + fcall string_squote; + } + + # Machine for processing double quoted strings. + string_dquote := |* + dquote => { + emit_buffer(@ts, :T_STRING) + fret; + }; + + any; + *|; + + # Machine for processing single quoted strings. + string_squote := |* + squote => { + emit_buffer(@ts, :T_STRING) + fret; + }; + + any; + *|; + + # DOCTYPES + # + # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax + # + # These rules support the 3 flavours of doctypes: + # + # 1. Normal doctypes, as introduced in the HTML5 specification. + # 2. Deprecated doctypes, the more verbose ones used prior to HTML5. + # 3. Legacy doctypes + # + doctype_start = ' { t(:T_DOCTYPE_TYPE) }; + + # Lex the public/system IDs as regular strings. + dquote => start_string_dquote; + squote => start_string_squote; + + # Whitespace inside doctypes is ignored since there's no point in + # including it. + whitespace; + + '>' => { + add_token(:T_DOCTYPE_END) + fret; + }; + *|; + + # CDATA + # + # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections + # + # CDATA tags are broken up into 3 parts: the start, the content and the + # end tag. + # + # In HTML CDATA tags have no meaning/are not supported. Oga does + # support them but treats their contents as plain text. + # + cdata_start = ''; + + action start_cdata { + emit_buffer + add_token(:T_CDATA_START) + + start_buffer + + fcall cdata; + } + + # Machine that for processing the contents of CDATA tags. Everything + # inside a CDATA tag is treated as plain text. + cdata := |* + cdata_end => { + emit_buffer + add_token(:T_CDATA_END) + + fret; + }; + + any; + *|; + + # Comments + # + # http://www.w3.org/TR/html-markup/syntax.html#comments + # + # Comments are lexed into 3 parts: the start tag, the content and the end + # tag. + # + # Unlike the W3 specification these rules *do* allow character sequences + # such as `--` and `->`. Putting extra checks in for these sequences + # would actually make the rules/actions more complex. + # + comment_start = ''; + + action start_comment { + emit_buffer + add_token(:T_COMMENT_START) + + start_buffer + + fcall comment; + } + + # Machine used for processing the contents of a comment. Everything + # inside a comment is treated as plain text (similar to CDATA tags). + comment := |* + comment_end => { + emit_buffer + add_token(:T_COMMENT_END) + + fret; + }; + + any; + *|; + + # XML declaration tags + # + # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd + # + xml_decl_start = ''; + + action start_xml_decl { + emit_buffer + add_token(:T_XML_DECL_START) + + start_buffer + + fcall xml_decl; + } + + # Machine that processes the contents of an XML declaration tag. + xml_decl := |* + xml_decl_end => { + emit_buffer + add_token(:T_XML_DECL_END) + + fret; + }; + + any; + *|; + + # Elements + # + # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements + # + + # Action that creates the tokens for the opening tag, name and namespace + # (if any). Remaining work is delegated to a dedicated machine. + action start_element { + emit_buffer + add_token(:T_ELEM_START) + + # Add the element name. If the name includes a namespace we'll break + # the name up into two separate tokens. + name = text(@ts + 1) + + if name.include?(':') + ns, name = name.split(':') + + add_token(:T_ELEM_NS, ns) + end + + @elements << name + + add_token(:T_ELEM_NAME, name) + + fcall element_head; + } + + element_name = [a-zA-Z0-9\-_:]+; + element_start = '<' element_name; + + # Machine used for processing the characters inside a element head. An + # element head is everything between ``. + # + # For example, in `

` the element head is ` foo="bar"`. + # + element_head := |* + whitespace | '='; + + newline => { advance_line }; + + # Attribute names. + element_name => { t(:T_ATTR) }; + + # Attribute values. + dquote => start_string_dquote; + squote => start_string_squote; + + # The closing character of the open tag. + ('>' | '/') => { + fhold; + fret; + }; + *|; + + main := |* + element_start => start_element; + doctype_start => start_doctype; + cdata_start => start_cdata; + comment_start => start_comment; + xml_decl_start => start_xml_decl; + + # Enter the body of the tag. If HTML mode is enabled and the current + # element is a void element we'll close it and bail out. + '>' => { + if html? and HTML_VOID_ELEMENTS.include?(current_element) + add_token(:T_ELEM_END, nil) + @elements.pop + end + }; + + # Regular closing tags. + '' => { + emit_buffer + add_token(:T_ELEM_END, nil) + + @elements.pop + }; + + # Self closing elements that are not handled by the HTML mode. + '/>' => { + add_token(:T_ELEM_END, nil) + + @elements.pop + }; + + # Note that this rule should be declared at the very bottom as it will + # otherwise take precedence over the other rules. + any => { + # First character, start buffering (unless we already are buffering). + start_buffer(@ts) unless buffering? + + # EOF, emit the text buffer. + if @te == eof + emit_buffer(@te) + end + }; + *|; + }%% + end # Lexer + end # XML +end # Oga diff --git a/lib/oga/xml/parser.rb b/lib/oga/xml/parser.rb new file mode 100644 index 0000000..784d878 --- /dev/null +++ b/lib/oga/xml/parser.rb @@ -0,0 +1,402 @@ +# +# DO NOT MODIFY!!!! +# This file is automatically generated by Racc 1.4.11 +# from Racc grammer file "". +# + +require 'racc/parser.rb' +module Oga + module XML + class Parser < Racc::Parser + + ## + # @param [Hash] options + # + # @option options [TrueClass|FalseClass] :html Enables HTML parsing mode. + # @see Oga::Lexer#initialize + # + def initialize(options = {}) + @lexer = Lexer.new(options) + end + + ## + # Resets the internal state of the parser. + # + def reset + @lines = [] + @line = 1 + end + + ## + # Emits a new AST token. + # + # @param [Symbol] type + # @param [Array] children + # + def s(type, *children) + return AST::Node.new( + type, + children.flatten, + :line => @line + ) + end + + ## + # Returns the next token from the lexer. + # + # @return [Array] + # + def next_token + type, value, line = @tokens.shift + + @line = line if line + + return type ? [type, value] : [false, false] + end + + ## + # @param [Fixnum] type The type of token the error occured on. + # @param [String] value The value of the token. + # @param [Array] stack The current stack of parsed nodes. + # @raise [Racc::ParseError] + # + def on_error(type, value, stack) + name = token_to_str(type) + index = @line - 1 + lines = '' + + # Show up to 5 lines before and after the offending line (if they exist). + (-5..5).each do |offset| + line = @lines[index + offset] + number = @line + offset + + if line and number > 0 + if offset == 0 + prefix = '=> ' + else + prefix = ' ' + end + + lines << "#{prefix}#{number}: #{line.strip}\n" + end + end + + raise Racc::ParseError, <<-EOF +Unexpected #{name} with value #{value.inspect} on line #{@line}: + +#{lines} + EOF + end + + ## + # Parses the supplied string and returns the AST. + # + # @example + # parser = Oga::Parser.new + # ast = parser.parse('bar') + # + # @param [String] string + # @return [Oga::AST::Node] + # + def parse(string) + @lines = string.lines + @tokens = @lexer.lex(string) + ast = do_parse + + reset + + return ast + end + +# vim: set ft=racc: +##### State transition tables begin ### + +racc_action_table = [ + 16, 40, 16, 10, 24, 37, 11, 22, 12, 28, + 14, 23, 21, 45, 31, 15, 16, 10, 44, 28, + 11, 43, 12, 36, 14, 35, 16, 10, 34, 15, + 11, 41, 12, 42, 14, 33, 16, 10, 17, 15, + 11, 46, 12, nil, 14, 29, 30, 19, 20, 15 ] + +racc_action_check = [ + 15, 28, 38, 38, 12, 24, 38, 11, 38, 13, + 38, 12, 11, 38, 15, 38, 2, 2, 35, 26, + 2, 35, 2, 22, 2, 20, 25, 25, 20, 2, + 25, 30, 25, 32, 25, 17, 0, 0, 1, 25, + 0, 44, 0, nil, 0, 14, 14, 10, 10, 0 ] + +racc_action_pointer = [ + 33, 38, 13, nil, nil, nil, nil, nil, nil, nil, + 42, 4, 1, -6, 33, -3, nil, 35, nil, nil, + 23, nil, 15, nil, -5, 23, 4, nil, -1, nil, + 19, nil, 16, nil, nil, 16, nil, nil, -1, nil, + nil, nil, nil, nil, 36, nil, nil ] + +racc_action_default = [ + -2, -32, -1, -4, -6, -7, -8, -9, -10, -11, + -32, -32, -32, -24, -32, -32, -31, -32, -3, -12, + -32, -16, -32, -18, -32, -5, -23, -26, -27, -21, + -32, -29, -32, 47, -13, -32, -17, -19, -32, -25, + -28, -22, -30, -14, -32, -20, -15 ] + +racc_goto_table = [ + 18, 2, 27, 32, 25, 26, 1, nil, nil, nil, + nil, nil, nil, nil, nil, 39, nil, nil, nil, nil, + nil, nil, nil, nil, nil, nil, 38, nil, nil, nil, + nil, nil, nil, nil, nil, nil, 18 ] + +racc_goto_check = [ + 3, 2, 13, 8, 11, 12, 1, nil, nil, nil, + nil, nil, nil, nil, nil, 13, nil, nil, nil, nil, + nil, nil, nil, nil, nil, nil, 2, nil, nil, nil, + nil, nil, nil, nil, nil, nil, 3 ] + +racc_goto_pointer = [ + nil, 6, 1, -2, nil, nil, nil, nil, -12, nil, + nil, -9, -8, -11 ] + +racc_goto_default = [ + nil, nil, nil, 3, 4, 5, 6, 7, 8, 9, + 13, nil, nil, nil ] + +racc_reduce_table = [ + 0, 0, :racc_error, + 1, 19, :_reduce_1, + 0, 19, :_reduce_2, + 2, 20, :_reduce_3, + 1, 20, :_reduce_4, + 0, 20, :_reduce_5, + 1, 21, :_reduce_none, + 1, 21, :_reduce_none, + 1, 21, :_reduce_none, + 1, 21, :_reduce_none, + 1, 21, :_reduce_none, + 1, 21, :_reduce_none, + 2, 22, :_reduce_12, + 3, 22, :_reduce_13, + 4, 22, :_reduce_14, + 5, 22, :_reduce_15, + 2, 23, :_reduce_16, + 3, 23, :_reduce_17, + 2, 24, :_reduce_18, + 3, 24, :_reduce_19, + 4, 25, :_reduce_20, + 2, 28, :_reduce_21, + 3, 28, :_reduce_22, + 1, 29, :_reduce_23, + 0, 29, :_reduce_24, + 2, 30, :_reduce_25, + 1, 30, :_reduce_26, + 1, 31, :_reduce_27, + 2, 31, :_reduce_28, + 2, 27, :_reduce_29, + 3, 27, :_reduce_30, + 1, 26, :_reduce_31 ] + +racc_reduce_n = 32 + +racc_shift_n = 47 + +racc_token_table = { + false => 0, + :error => 1, + :T_STRING => 2, + :T_TEXT => 3, + :T_DOCTYPE_START => 4, + :T_DOCTYPE_END => 5, + :T_DOCTYPE_TYPE => 6, + :T_CDATA_START => 7, + :T_CDATA_END => 8, + :T_COMMENT_START => 9, + :T_COMMENT_END => 10, + :T_ELEM_START => 11, + :T_ELEM_NAME => 12, + :T_ELEM_NS => 13, + :T_ELEM_END => 14, + :T_ATTR => 15, + :T_XML_DECL_START => 16, + :T_XML_DECL_END => 17 } + +racc_nt_base = 18 + +racc_use_result_var = false + +Racc_arg = [ + racc_action_table, + racc_action_check, + racc_action_default, + racc_action_pointer, + racc_goto_table, + racc_goto_check, + racc_goto_default, + racc_goto_pointer, + racc_nt_base, + racc_reduce_table, + racc_token_table, + racc_shift_n, + racc_reduce_n, + racc_use_result_var ] + +Racc_token_to_s_table = [ + "$end", + "error", + "T_STRING", + "T_TEXT", + "T_DOCTYPE_START", + "T_DOCTYPE_END", + "T_DOCTYPE_TYPE", + "T_CDATA_START", + "T_CDATA_END", + "T_COMMENT_START", + "T_COMMENT_END", + "T_ELEM_START", + "T_ELEM_NAME", + "T_ELEM_NS", + "T_ELEM_END", + "T_ATTR", + "T_XML_DECL_START", + "T_XML_DECL_END", + "$start", + "document", + "expressions", + "expression", + "doctype", + "cdata", + "comment", + "element", + "text", + "xmldecl", + "element_open", + "attributes", + "attributes_", + "attribute" ] + +Racc_debug_parser = false + +##### State transition tables end ##### + +# reduce 0 omitted + +def _reduce_1(val, _values) + s(:document, val[0]) +end + +def _reduce_2(val, _values) + s(:document) +end + +def _reduce_3(val, _values) + val.compact +end + +def _reduce_4(val, _values) + val[0] +end + +def _reduce_5(val, _values) + nil +end + +# reduce 6 omitted + +# reduce 7 omitted + +# reduce 8 omitted + +# reduce 9 omitted + +# reduce 10 omitted + +# reduce 11 omitted + +def _reduce_12(val, _values) + s(:doctype) +end + +def _reduce_13(val, _values) + s(:doctype, val[1]) + +end + +def _reduce_14(val, _values) + s(:doctype, val[1], val[2]) + +end + +def _reduce_15(val, _values) + s(:doctype, val[1], val[2], val[3]) + +end + +def _reduce_16(val, _values) + s(:cdata) +end + +def _reduce_17(val, _values) + s(:cdata, val[1]) +end + +def _reduce_18(val, _values) + s(:comment) +end + +def _reduce_19(val, _values) + s(:comment, val[1]) +end + +def _reduce_20(val, _values) + s(:element, val[0], val[1], val[2]) + +end + +def _reduce_21(val, _values) + [nil, val[1]] +end + +def _reduce_22(val, _values) + [val[1], val[2]] +end + +def _reduce_23(val, _values) + s(:attributes, val[0]) +end + +def _reduce_24(val, _values) + nil +end + +def _reduce_25(val, _values) + val +end + +def _reduce_26(val, _values) + val +end + +def _reduce_27(val, _values) + s(:attribute, val[0]) +end + +def _reduce_28(val, _values) + s(:attribute, val[0], val[1]) +end + +def _reduce_29(val, _values) + s(:xml_decl) +end + +def _reduce_30(val, _values) + s(:xml_decl, val[1]) +end + +def _reduce_31(val, _values) + s(:text, val[0]) +end + +def _reduce_none(val, _values) + val[0] +end + + end # class Parser + end # module XML + end # module Oga diff --git a/lib/oga/parser.y b/lib/oga/xml/parser.y similarity index 98% rename from lib/oga/parser.y rename to lib/oga/xml/parser.y index 274499d..ac90cc8 100644 --- a/lib/oga/parser.y +++ b/lib/oga/xml/parser.y @@ -5,9 +5,9 @@ # It requires every tag to have a closing tag. As such you'll need to enable # HTML parsing mode when parsing HTML. This can be done as following: # -# parser = Oga::Parser.new(:html => true) +# parser = Oga::XML::Parser.new(:html => true) # -class Oga::Parser +class Oga::XML::Parser token T_STRING T_TEXT token T_DOCTYPE_START T_DOCTYPE_END T_DOCTYPE_TYPE diff --git a/spec/oga/lexer/cdata_spec.rb b/spec/oga/lexer/cdata_spec.rb index f641a4f..f4465dd 100644 --- a/spec/oga/lexer/cdata_spec.rb +++ b/spec/oga/lexer/cdata_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Lexer do +describe Oga::XML::Lexer do context 'cdata tags' do example 'lex a cdata tag' do lex('').should == [ diff --git a/spec/oga/lexer/comments_spec.rb b/spec/oga/lexer/comments_spec.rb index 68f5985..25b0bd5 100644 --- a/spec/oga/lexer/comments_spec.rb +++ b/spec/oga/lexer/comments_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Lexer do +describe Oga::XML::Lexer do context 'comments' do example 'lex a comment' do lex('').should == [ diff --git a/spec/oga/lexer/doctype_spec.rb b/spec/oga/lexer/doctype_spec.rb index 757d9a8..666743d 100644 --- a/spec/oga/lexer/doctype_spec.rb +++ b/spec/oga/lexer/doctype_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Lexer do +describe Oga::XML::Lexer do context 'doctypes' do example 'lex the HTML5 doctype' do lex('').should == [ diff --git a/spec/oga/lexer/documents_spec.rb b/spec/oga/lexer/documents_spec.rb index 83d63c0..9c13751 100644 --- a/spec/oga/lexer/documents_spec.rb +++ b/spec/oga/lexer/documents_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Lexer do +describe Oga::XML::Lexer do context 'HTML documents' do example 'lex a basic HTML document' do html = <<-EOF diff --git a/spec/oga/lexer/elements_spec.rb b/spec/oga/lexer/elements_spec.rb index 710df16..9de96e1 100644 --- a/spec/oga/lexer/elements_spec.rb +++ b/spec/oga/lexer/elements_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Lexer do +describe Oga::XML::Lexer do context 'elements' do example 'lex an opening element' do lex('

').should == [ diff --git a/spec/oga/lexer/general_spec.rb b/spec/oga/lexer/general_spec.rb index 0c8cf87..93f35a4 100644 --- a/spec/oga/lexer/general_spec.rb +++ b/spec/oga/lexer/general_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Lexer do +describe Oga::XML::Lexer do context 'regular text' do example 'lex regular text' do lex('hello').should == [[:T_TEXT, 'hello', 1]] diff --git a/spec/oga/lexer/html_void_elements_spec.rb b/spec/oga/lexer/html_void_elements_spec.rb index b80fc8e..aac509f 100644 --- a/spec/oga/lexer/html_void_elements_spec.rb +++ b/spec/oga/lexer/html_void_elements_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Lexer do +describe Oga::XML::Lexer do context 'HTML void elements' do example 'lex a void element that omits the closing /' do lex('', :html => true).should == [ diff --git a/spec/oga/lexer/xml_declaration_spec.rb b/spec/oga/lexer/xml_declaration_spec.rb index 041c2d2..eb3de1f 100644 --- a/spec/oga/lexer/xml_declaration_spec.rb +++ b/spec/oga/lexer/xml_declaration_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Lexer do +describe Oga::XML::Lexer do context 'XML declaration tags' do example 'lex a start tag' do lex('').should == s(:document, s(:cdata, 'foo')) diff --git a/spec/oga/parser/comments_spec.rb b/spec/oga/parser/comments_spec.rb index 1c87c6d..6aa12cc 100644 --- a/spec/oga/parser/comments_spec.rb +++ b/spec/oga/parser/comments_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Parser do +describe Oga::XML::Parser do context 'comments' do example 'parse an empty comment' do parse('').should == s(:document, s(:comment)) diff --git a/spec/oga/parser/doctype_spec.rb b/spec/oga/parser/doctype_spec.rb index 301b7e2..015f30e 100644 --- a/spec/oga/parser/doctype_spec.rb +++ b/spec/oga/parser/doctype_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Parser do +describe Oga::XML::Parser do context 'doctypes' do example 'parse a doctype' do parse('').should == s(:document, s(:doctype)) diff --git a/spec/oga/parser/documents_spec.rb b/spec/oga/parser/documents_spec.rb index 204f8d7..b5c7192 100644 --- a/spec/oga/parser/documents_spec.rb +++ b/spec/oga/parser/documents_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Parser do +describe Oga::XML::Parser do context 'HTML documents' do example 'parse a basic HTML document' do html = <<-EOF diff --git a/spec/oga/parser/elements_spec.rb b/spec/oga/parser/elements_spec.rb index 9b71b5d..c60f561 100644 --- a/spec/oga/parser/elements_spec.rb +++ b/spec/oga/parser/elements_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Parser do +describe Oga::XML::Parser do context 'elements' do example 'parse an empty element' do parse('

').should == s( diff --git a/spec/oga/parser/general_spec.rb b/spec/oga/parser/general_spec.rb index 94bd0d1..cfc0b76 100644 --- a/spec/oga/parser/general_spec.rb +++ b/spec/oga/parser/general_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Parser do +describe Oga::XML::Parser do example 'parse regular text' do parse('foo').should == s(:document, s(:text, 'foo')) end diff --git a/spec/oga/parser/html_void_elements_spec.rb b/spec/oga/parser/html_void_elements_spec.rb index 74d135e..29e481e 100644 --- a/spec/oga/parser/html_void_elements_spec.rb +++ b/spec/oga/parser/html_void_elements_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Parser do +describe Oga::XML::Parser do context 'HTML void elements' do example 'parse a void element that omits the closing /' do parse('', :html => true).should == s( diff --git a/spec/oga/parser/xml_declaration_spec.rb b/spec/oga/parser/xml_declaration_spec.rb index e0df820..3096a5d 100644 --- a/spec/oga/parser/xml_declaration_spec.rb +++ b/spec/oga/parser/xml_declaration_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe Oga::Parser do +describe Oga::XML::Parser do context 'XML declaration tags' do example 'lex an XML declaration tag' do parse('').should == s( diff --git a/spec/support/parsing.rb b/spec/support/parsing.rb index 8bb26c9..37b32ac 100644 --- a/spec/support/parsing.rb +++ b/spec/support/parsing.rb @@ -19,7 +19,7 @@ module Oga # @return [Array] # def lex(input, options = {}) - return Oga::Lexer.new(options).lex(input) + return Oga::XML::Lexer.new(options).lex(input) end ## @@ -30,7 +30,7 @@ module Oga # @return [Oga::AST::Node] # def parse(input, options = {}) - return Oga::Parser.new(options).parse(input) + return Oga::XML::Parser.new(options).parse(input) end end # ParsingHelpers end # Oga