Use separate Ragel machines for script/style tags
Previously a single Ragel machine was used for processing the contents of
both HTML script and style tags, and that machine stopped at either closing
tag (`</script>` or `</style>`). This had the unfortunate side-effect that the
following was not parsed correctly (despite being valid HTML):
    <script>
    var foo = "</style>";
    </script>
The same applied to style tags:
    <style>
    /* </script> */
    </style>
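By using a separate machine per tag, each machine terminates only on its own
closing tag, which works around the above issue. The downside is that the
lexer can now emit multiple consecutive T_TEXT tokens for a single run of text
(the text is split around each run of `<` characters), which have to be
stitched back together in the parser.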
			
			
This commit is contained in:
		
							parent
							
								
									2d43e459a1
								
							
						
					
					
						commit
						73fbbfbdbd
					
				| 
						 | 
				
			
			@ -19,11 +19,15 @@ on `ts` and `te`) so the macro ignores this argument.
 | 
			
		|||
#define advance_line(amount) \
 | 
			
		||||
    rb_funcall(self, id_advance_line, 1, INT2NUM(amount));
 | 
			
		||||
 | 
			
		||||
#define literal_html_element_p() \
 | 
			
		||||
    rb_funcall(self, id_literal_html_element_p, 0) == Qtrue
 | 
			
		||||
#define html_script_p() \
 | 
			
		||||
    rb_funcall(self, id_html_script_p, 0) == Qtrue
 | 
			
		||||
 | 
			
		||||
#define html_style_p() \
 | 
			
		||||
    rb_funcall(self, id_html_style_p, 0) == Qtrue
 | 
			
		||||
 | 
			
		||||
ID id_advance_line;
 | 
			
		||||
ID id_literal_html_element_p;
 | 
			
		||||
ID id_html_script_p;
 | 
			
		||||
ID id_html_style_p;
 | 
			
		||||
ID id_html;
 | 
			
		||||
 | 
			
		||||
%%machine c_lexer;
 | 
			
		||||
| 
						 | 
				
			
			@ -183,9 +187,10 @@ void Init_liboga_xml_lexer()
 | 
			
		|||
    VALUE mXML   = rb_const_get(mOga, rb_intern("XML"));
 | 
			
		||||
    VALUE cLexer = rb_define_class_under(mXML, "Lexer", rb_cObject);
 | 
			
		||||
 | 
			
		||||
    id_advance_line           = rb_intern("advance_line");
 | 
			
		||||
    id_literal_html_element_p = rb_intern("literal_html_element?");
 | 
			
		||||
    id_html                   = rb_intern("html");
 | 
			
		||||
    id_advance_line  = rb_intern("advance_line");
 | 
			
		||||
    id_html_script_p = rb_intern("html_script?");
 | 
			
		||||
    id_html_style_p  = rb_intern("html_style?");
 | 
			
		||||
    id_html          = rb_intern("html");
 | 
			
		||||
 | 
			
		||||
    rb_define_method(cLexer, "advance_native", oga_xml_lexer_advance, 1);
 | 
			
		||||
    rb_define_method(cLexer, "reset_native", oga_xml_lexer_reset, 0);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -194,13 +194,23 @@ public class Lexer extends RubyObject
 | 
			
		|||
    }
 | 
			
		||||
 | 
			
		||||
    /**
 | 
			
		||||
     * See * Oga::XML::Lexer#literal_html_element? for more information.
 | 
			
		||||
     * @see Oga::XML::Lexer#html_script?
 | 
			
		||||
     */
 | 
			
		||||
    public Boolean literal_html_element_p()
 | 
			
		||||
    public Boolean html_script_p()
 | 
			
		||||
    {
 | 
			
		||||
        ThreadContext context = this.runtime.getCurrentContext();
 | 
			
		||||
 | 
			
		||||
        return this.callMethod(context, "literal_html_element?").isTrue();
 | 
			
		||||
        return this.callMethod(context, "html_script?").isTrue();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /**
 | 
			
		||||
     * @see Oga::XML::Lexer#html_style?
 | 
			
		||||
     */
 | 
			
		||||
    public Boolean html_style_p()
 | 
			
		||||
    {
 | 
			
		||||
        ThreadContext context = this.runtime.getCurrentContext();
 | 
			
		||||
 | 
			
		||||
        return this.callMethod(context, "html_style?").isTrue();
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -58,7 +58,7 @@
 | 
			
		|||
    }
 | 
			
		||||
 | 
			
		||||
    action advance_newline {
 | 
			
		||||
        advance_line(1)
 | 
			
		||||
        advance_line(1);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    action hold_and_return {
 | 
			
		||||
| 
						 | 
				
			
			@ -376,6 +376,12 @@
 | 
			
		|||
        callback_simple(id_on_element_end);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    action close_element_fnext_main {
 | 
			
		||||
        callback_simple(id_on_element_end);
 | 
			
		||||
 | 
			
		||||
        fnext main;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    # Machine used for lexing the name/namespace of an element.
 | 
			
		||||
    element_name := |*
 | 
			
		||||
        identifier ':' => {
 | 
			
		||||
| 
						 | 
				
			
			@ -465,9 +471,13 @@
 | 
			
		|||
        '>' => {
 | 
			
		||||
            callback_simple(id_on_element_open_end);
 | 
			
		||||
 | 
			
		||||
            if ( literal_html_element_p() )
 | 
			
		||||
            if ( html_script_p() )
 | 
			
		||||
            {
 | 
			
		||||
                fnext literal_html_element;
 | 
			
		||||
                fnext html_script;
 | 
			
		||||
            }
 | 
			
		||||
            else if ( html_style_p() )
 | 
			
		||||
            {
 | 
			
		||||
                fnext html_style;
 | 
			
		||||
            }
 | 
			
		||||
            else
 | 
			
		||||
            {
 | 
			
		||||
| 
						 | 
				
			
			@ -506,6 +516,17 @@
 | 
			
		|||
    terminate_text = '</' | '<!' | '<?' | element_start;
 | 
			
		||||
    allowed_text   = (any* -- terminate_text) $count_newlines;
 | 
			
		||||
 | 
			
		||||
    action emit_text {
 | 
			
		||||
        callback(id_on_text, data, encoding, ts, te);
 | 
			
		||||
 | 
			
		||||
        if ( lines > 0 )
 | 
			
		||||
        {
 | 
			
		||||
            advance_line(lines);
 | 
			
		||||
 | 
			
		||||
            lines = 0;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    text := |*
 | 
			
		||||
        terminate_text | allowed_text => {
 | 
			
		||||
            callback(id_on_text, data, encoding, ts, te);
 | 
			
		||||
| 
						 | 
				
			
			@ -541,36 +562,17 @@
 | 
			
		|||
    # Certain tags in HTML can contain basically anything except for the literal
 | 
			
		||||
    # closing tag. Two examples are script and style tags.  As a result of this
 | 
			
		||||
    # we can't use the regular text machine.
 | 
			
		||||
    literal_html_closing_tags = '</script>' | '</style>';
 | 
			
		||||
    literal_html_allowed = (any* -- literal_html_closing_tags) $count_newlines;
 | 
			
		||||
 | 
			
		||||
    literal_html_element := |*
 | 
			
		||||
        literal_html_allowed => {
 | 
			
		||||
            callback(id_on_text, data, encoding, ts, te);
 | 
			
		||||
    literal_html_allowed = (^'<'+ | '<'+) $count_newlines;
 | 
			
		||||
 | 
			
		||||
            if ( lines > 0 )
 | 
			
		||||
            {
 | 
			
		||||
                advance_line(lines);
 | 
			
		||||
    html_script := |*
 | 
			
		||||
        literal_html_allowed => emit_text;
 | 
			
		||||
        '</script>'          => close_element_fnext_main;
 | 
			
		||||
    *|;
 | 
			
		||||
 | 
			
		||||
                lines = 0;
 | 
			
		||||
            }
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        literal_html_allowed %{ mark = p; } literal_html_closing_tags => {
 | 
			
		||||
            callback(id_on_text, data, encoding, ts, mark);
 | 
			
		||||
 | 
			
		||||
            p    = mark - 1;
 | 
			
		||||
            mark = 0;
 | 
			
		||||
 | 
			
		||||
            if ( lines > 0 )
 | 
			
		||||
            {
 | 
			
		||||
                advance_line(lines);
 | 
			
		||||
 | 
			
		||||
                lines = 0;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            fnext main;
 | 
			
		||||
        };
 | 
			
		||||
    html_style := |*
 | 
			
		||||
        literal_html_allowed => emit_text;
 | 
			
		||||
        '</style>'           => close_element_fnext_main;
 | 
			
		||||
    *|;
 | 
			
		||||
 | 
			
		||||
    # The main machine aka the entry point of Ragel.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -40,12 +40,18 @@ module Oga
 | 
			
		|||
    class Lexer
 | 
			
		||||
      attr_reader :html
 | 
			
		||||
 | 
			
		||||
      # @return [String]
 | 
			
		||||
      HTML_SCRIPT = 'script'
 | 
			
		||||
 | 
			
		||||
      # @return [String]
 | 
			
		||||
      HTML_STYLE = 'style'
 | 
			
		||||
 | 
			
		||||
      ##
 | 
			
		||||
      # Names of HTML tags of which the content should be lexed as-is.
 | 
			
		||||
      #
 | 
			
		||||
      # @return [Array]
 | 
			
		||||
      #
 | 
			
		||||
      LITERAL_HTML_ELEMENTS = %w{script style}
 | 
			
		||||
      LITERAL_HTML_ELEMENTS = [HTML_SCRIPT, HTML_STYLE]
 | 
			
		||||
 | 
			
		||||
      ##
 | 
			
		||||
      # @param [String|IO] data The data to lex. This can either be a String or
 | 
			
		||||
| 
						 | 
				
			
			@ -189,12 +195,17 @@ module Oga
 | 
			
		|||
      end
 | 
			
		||||
 | 
			
		||||
      ##
 | 
			
		||||
      # Returns true if the current element's content should be lexed as-is.
 | 
			
		||||
      #
 | 
			
		||||
      # @return [TrueClass|FalseClass]
 | 
			
		||||
      #
 | 
			
		||||
      def literal_html_element?
 | 
			
		||||
        return html? && LITERAL_HTML_ELEMENTS.include?(current_element)
 | 
			
		||||
      def html_script?
 | 
			
		||||
        return html? && current_element == HTML_SCRIPT
 | 
			
		||||
      end
 | 
			
		||||
 | 
			
		||||
      ##
 | 
			
		||||
      # @return [TrueClass|FalseClass]
 | 
			
		||||
      #
 | 
			
		||||
      def html_style?
 | 
			
		||||
        return html? && current_element == HTML_STYLE
 | 
			
		||||
      end
 | 
			
		||||
 | 
			
		||||
      ##
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -176,7 +176,17 @@ xml_decl
 | 
			
		|||
# Plain text
 | 
			
		||||
 | 
			
		||||
text
 | 
			
		||||
  = T_TEXT { on_text(val[0]) }
 | 
			
		||||
  = T_TEXT text_follow
 | 
			
		||||
    {
 | 
			
		||||
      text = val[1] ? val[0] + val[1] : val[0]
 | 
			
		||||
 | 
			
		||||
      on_text(text)
 | 
			
		||||
    }
 | 
			
		||||
  ;
 | 
			
		||||
 | 
			
		||||
text_follow
 | 
			
		||||
  = T_TEXT text_follow { val[1] ? val[0] + val[1] : val[0] }
 | 
			
		||||
  | _                  { nil }
 | 
			
		||||
  ;
 | 
			
		||||
 | 
			
		||||
# Strings
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,10 +3,24 @@ require 'spec_helper'
 | 
			
		|||
describe Oga::XML::Lexer do
 | 
			
		||||
  describe 'HTML script elements' do
 | 
			
		||||
    it 'treats the content of a script tag as plain text' do
 | 
			
		||||
      lex('<script>foo <bar</script>', :html => true).should == [
 | 
			
		||||
      lex_html('<script>foo <bar</script>').should == [
 | 
			
		||||
        [:T_ELEM_START, nil, 1],
 | 
			
		||||
        [:T_ELEM_NAME, 'script', 1],
 | 
			
		||||
        [:T_TEXT, 'foo <bar', 1],
 | 
			
		||||
        [:T_TEXT, 'foo ', 1],
 | 
			
		||||
        [:T_TEXT, '<', 1],
 | 
			
		||||
        [:T_TEXT, 'bar', 1],
 | 
			
		||||
        [:T_ELEM_END, nil, 1]
 | 
			
		||||
      ]
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    it 'treats style tags inside script tags as text' do
 | 
			
		||||
      lex_html('<script><style></style></script>').should == [
 | 
			
		||||
        [:T_ELEM_START, nil, 1],
 | 
			
		||||
        [:T_ELEM_NAME, 'script', 1],
 | 
			
		||||
        [:T_TEXT, '<', 1],
 | 
			
		||||
        [:T_TEXT, 'style>', 1],
 | 
			
		||||
        [:T_TEXT, '<', 1],
 | 
			
		||||
        [:T_TEXT, '/style>', 1],
 | 
			
		||||
        [:T_ELEM_END, nil, 1]
 | 
			
		||||
      ]
 | 
			
		||||
    end
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,7 +3,7 @@ require 'spec_helper'
 | 
			
		|||
describe Oga::XML::Lexer do
 | 
			
		||||
  describe 'HTML style elements' do
 | 
			
		||||
    it 'lexes an empty <style> tag' do
 | 
			
		||||
      lex('<style></style>', :html => true).should == [
 | 
			
		||||
      lex_html('<style></style>').should == [
 | 
			
		||||
        [:T_ELEM_START, nil, 1],
 | 
			
		||||
        [:T_ELEM_NAME, 'style', 1],
 | 
			
		||||
        [:T_ELEM_END, nil, 1]
 | 
			
		||||
| 
						 | 
				
			
			@ -11,16 +11,30 @@ describe Oga::XML::Lexer do
 | 
			
		|||
    end
 | 
			
		||||
 | 
			
		||||
    it 'treats the content of a style tag as plain text' do
 | 
			
		||||
      lex('<style>foo <bar</style>', :html => true).should == [
 | 
			
		||||
      lex_html('<style>foo <bar</style>').should == [
 | 
			
		||||
        [:T_ELEM_START, nil, 1],
 | 
			
		||||
        [:T_ELEM_NAME, 'style', 1],
 | 
			
		||||
        [:T_TEXT, 'foo <bar', 1],
 | 
			
		||||
        [:T_TEXT, 'foo ', 1],
 | 
			
		||||
        [:T_TEXT, '<', 1],
 | 
			
		||||
        [:T_TEXT, 'bar', 1],
 | 
			
		||||
        [:T_ELEM_END, nil, 1]
 | 
			
		||||
      ]
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    it 'treats script tags inside style tags as text' do
 | 
			
		||||
      lex_html('<style><script></script></style>').should == [
 | 
			
		||||
        [:T_ELEM_START, nil, 1],
 | 
			
		||||
        [:T_ELEM_NAME, 'style', 1],
 | 
			
		||||
        [:T_TEXT, '<', 1],
 | 
			
		||||
        [:T_TEXT, 'script>', 1],
 | 
			
		||||
        [:T_TEXT, '<', 1],
 | 
			
		||||
        [:T_TEXT, '/script>', 1],
 | 
			
		||||
        [:T_ELEM_END, nil, 1]
 | 
			
		||||
      ]
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    it 'lexes a multi-line <style> tag using a String as the input' do
 | 
			
		||||
      lex("<style>foo\nbar</style>", :html => true).should == [
 | 
			
		||||
      lex_html("<style>foo\nbar</style>").should == [
 | 
			
		||||
        [:T_ELEM_START, nil, 1],
 | 
			
		||||
        [:T_ELEM_NAME, 'style', 1],
 | 
			
		||||
        [:T_TEXT, "foo\nbar", 1],
 | 
			
		||||
| 
						 | 
				
			
			@ -29,9 +43,7 @@ describe Oga::XML::Lexer do
 | 
			
		|||
    end
 | 
			
		||||
 | 
			
		||||
    it 'lexes a multi-line <style> tag using an IO as the input' do
 | 
			
		||||
      io = StringIO.new("<style>foo\nbar</style>")
 | 
			
		||||
 | 
			
		||||
      lex(io, :html => true).should == [
 | 
			
		||||
      lex_stringio("<style>foo\nbar</style>", :html => true).should == [
 | 
			
		||||
        [:T_ELEM_START, nil, 1],
 | 
			
		||||
        [:T_ELEM_NAME, 'style', 1],
 | 
			
		||||
        [:T_TEXT, "foo\n", 1],
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue