Tighten lexing of T_TEXT nodes.
Thanks to some heavy rubber-ducking with @whitequark, the lexer is now a little
bit better at lexing T_TEXT nodes. For example, previously the following input
could not be lexed properly:
    "foo < bar"
There might still be some tweaking to do, but we're getting there.
			
			
This commit is contained in:
		
							parent
							
								
									145315c26a
								
							
						
					
					
						commit
						49ddebf358
					
				| 
						 | 
					@ -37,7 +37,8 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    newline    = '\n' | '\r\n';
 | 
					    newline    = '\n' | '\r\n';
 | 
				
			||||||
    whitespace = [ \t];
 | 
					    whitespace = [ \t];
 | 
				
			||||||
    identifier = [a-zA-Z0-9\-_]+;
 | 
					    ident_char = [a-zA-Z0-9\-_];
 | 
				
			||||||
 | 
					    identifier = ident_char+;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Comments
 | 
					    # Comments
 | 
				
			||||||
    #
 | 
					    #
 | 
				
			||||||
| 
						 | 
					@ -209,13 +210,19 @@
 | 
				
			||||||
    # body of an element is lexed using the `main` machine.
 | 
					    # body of an element is lexed using the `main` machine.
 | 
				
			||||||
    #
 | 
					    #
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    element_end = '</' identifier (':' identifier)* '>';
 | 
					    element_start = '<' ident_char;
 | 
				
			||||||
 | 
					    element_end   = '</' identifier (':' identifier)* '>';
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    action start_element {
 | 
					    action start_element {
 | 
				
			||||||
        callback_simple("on_element_start");
 | 
					        callback_simple("on_element_start");
 | 
				
			||||||
 | 
					        fhold;
 | 
				
			||||||
        fnext element_name;
 | 
					        fnext element_name;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    action close_element {
 | 
				
			||||||
 | 
					        callback_simple("on_element_end");
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Machine used for lexing the name/namespace of an element.
 | 
					    # Machine used for lexing the name/namespace of an element.
 | 
				
			||||||
    element_name := |*
 | 
					    element_name := |*
 | 
				
			||||||
        identifier ':' => {
 | 
					        identifier ':' => {
 | 
				
			||||||
| 
						 | 
					@ -262,6 +269,46 @@
 | 
				
			||||||
        };
 | 
					        };
 | 
				
			||||||
    *|;
 | 
					    *|;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Text
 | 
				
			||||||
 | 
					    #
 | 
				
			||||||
 | 
					    # http://www.w3.org/TR/xml/#syntax
 | 
				
			||||||
 | 
					    # http://www.w3.org/TR/html-markup/syntax.html#text-syntax
 | 
				
			||||||
 | 
					    #
 | 
				
			||||||
 | 
					    # Text content is everything leading up to certain special tags such as "</"
 | 
				
			||||||
 | 
					    # and "<?".
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    action start_text {
 | 
				
			||||||
 | 
					        fhold;
 | 
				
			||||||
 | 
					        fnext text;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # These characters terminate a T_TEXT sequence and instruct Ragel to jump
 | 
				
			||||||
 | 
					    # back to the main machine.
 | 
				
			||||||
 | 
					    #
 | 
				
			||||||
 | 
					    # Note that this only works if each sequence is exactly 2 characters
 | 
				
			||||||
 | 
					    # long. Because of this "<!" is used instead of "<!--".
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    terminate_text = '</' | '<!' | '<?' | element_start;
 | 
				
			||||||
 | 
					    allowed_text   = any* -- terminate_text;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    text := |*
 | 
				
			||||||
 | 
					        # Text followed by a special tag, such as "foo<!--"
 | 
				
			||||||
 | 
					        allowed_text @{ mark = p; } terminate_text => {
 | 
				
			||||||
 | 
					            callback("on_text", data, encoding, ts, mark);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            p    = mark - 1;
 | 
				
			||||||
 | 
					            mark = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            fnext main;
 | 
				
			||||||
 | 
					        };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Just regular text.
 | 
				
			||||||
 | 
					        allowed_text => {
 | 
				
			||||||
 | 
					            callback("on_text", data, encoding, ts, te);
 | 
				
			||||||
 | 
					            fnext main;
 | 
				
			||||||
 | 
					        };
 | 
				
			||||||
 | 
					    *|;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # The main machine aka the entry point of Ragel.
 | 
					    # The main machine aka the entry point of Ragel.
 | 
				
			||||||
    main := |*
 | 
					    main := |*
 | 
				
			||||||
        doctype_start  => start_doctype;
 | 
					        doctype_start  => start_doctype;
 | 
				
			||||||
| 
						 | 
					@ -269,19 +316,8 @@
 | 
				
			||||||
        comment        => start_comment;
 | 
					        comment        => start_comment;
 | 
				
			||||||
        cdata          => start_cdata;
 | 
					        cdata          => start_cdata;
 | 
				
			||||||
        proc_ins_start => start_proc_ins;
 | 
					        proc_ins_start => start_proc_ins;
 | 
				
			||||||
 | 
					        element_start  => start_element;
 | 
				
			||||||
        # The start of an element.
 | 
					        element_end    => close_element;
 | 
				
			||||||
        '<' => start_element;
 | 
					        any            => start_text;
 | 
				
			||||||
 | 
					 | 
				
			||||||
        # Regular closing tags.
 | 
					 | 
				
			||||||
        element_end => {
 | 
					 | 
				
			||||||
            callback_simple("on_element_end");
 | 
					 | 
				
			||||||
        };
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # Treat everything else, except for "<", as regular text. The "<" sign
 | 
					 | 
				
			||||||
        # is used for tags so we can't emit text nodes for these characters.
 | 
					 | 
				
			||||||
        any+ -- '<' => {
 | 
					 | 
				
			||||||
            callback("on_text", data, encoding, ts, te);
 | 
					 | 
				
			||||||
        };
 | 
					 | 
				
			||||||
    *|;
 | 
					    *|;
 | 
				
			||||||
}%%
 | 
					}%%
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue