From 317b49bcf64e4a93d4d90ed844f48ddb6e18e8fe Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Tue, 16 Sep 2014 14:30:46 +0200 Subject: [PATCH] Implemented a basic SAX API. This API is a little bit dodgy (similar to Nokogiri's API) due to the use of separate parser and handler classes. This is done to ensure that the return values of callback methods (e.g. on_element) aren't used by Racc for building AST trees. This also ensures that whatever variables are set by the handler don't conflict with any variables of the parser. This fixes #42. --- lib/oga.rb | 3 ++ lib/oga/html/sax_parser.rb | 18 +++++++++ lib/oga/oga.rb | 30 +++++++++++++++ lib/oga/xml/sax_parser.rb | 63 ++++++++++++++++++++++++++++++++ spec/oga/html/sax_parser_spec.rb | 22 +++++++++++ spec/oga/oga_spec.rb | 40 +++++++++++++++----- spec/oga/xml/sax_parser_spec.rb | 35 ++++++++++++++++++ 7 files changed, 202 insertions(+), 9 deletions(-) create mode 100644 lib/oga/html/sax_parser.rb create mode 100644 lib/oga/xml/sax_parser.rb create mode 100644 spec/oga/html/sax_parser_spec.rb create mode 100644 spec/oga/xml/sax_parser_spec.rb diff --git a/lib/oga.rb b/lib/oga.rb index 75cf725..5c18861 100644 --- a/lib/oga.rb +++ b/lib/oga.rb @@ -37,8 +37,11 @@ require_relative 'oga/xml/attribute' require_relative 'oga/xml/element' require_relative 'oga/xml/node_set' +require_relative 'oga/xml/sax_parser' require_relative 'oga/xml/pull_parser' + require_relative 'oga/html/parser' +require_relative 'oga/html/sax_parser' require_relative 'oga/xpath/node' require_relative 'oga/xpath/lexer' diff --git a/lib/oga/html/sax_parser.rb b/lib/oga/html/sax_parser.rb new file mode 100644 index 0000000..07f7731 --- /dev/null +++ b/lib/oga/html/sax_parser.rb @@ -0,0 +1,18 @@ +module Oga + module HTML + ## + # SAX parser for HTML documents. See the documentation of + # {Oga::XML::SaxParser} for more information. 
+ # + class SaxParser < XML::SaxParser + ## + # @see [Oga::XML::SaxParser#initialize] + # + def initialize(handler, data, options = {}) + options = options.merge(:html => true) + + super(handler, data, options) + end + end # SaxParser + end # HTML +end # Oga diff --git a/lib/oga/oga.rb b/lib/oga/oga.rb index 7f40378..8cf78ca 100644 --- a/lib/oga/oga.rb +++ b/lib/oga/oga.rb @@ -24,4 +24,34 @@ module Oga def self.parse_html(html) return HTML::Parser.new(html).parse end + + ## + # Parses the given XML document using the SAX parser. + # + # @example + # handler = SomeSaxHandler.new + # + # Oga.sax_parse_xml(handler, 'Hello') + # + # @param [Object] handler The SAX handler for the parser. + # @param [String|IO] xml The XML to parse. + # + def self.sax_parse_xml(handler, xml) + XML::SaxParser.new(handler, xml).parse + end + + ## + # Parses the given HTML document using the SAX parser. + # + # @example + # handler = SomeSaxHandler.new + # + # Oga.sax_parse_html(handler, '') + # + # @param [Object] handler The SAX handler for the parser. + # @param [String|IO] html The HTML to parse. + # + def self.sax_parse_html(handler, html) + HTML::SaxParser.new(handler, html).parse + end end # Oga diff --git a/lib/oga/xml/sax_parser.rb b/lib/oga/xml/sax_parser.rb new file mode 100644 index 0000000..47a9574 --- /dev/null +++ b/lib/oga/xml/sax_parser.rb @@ -0,0 +1,63 @@ +module Oga + module XML + ## + # The SaxParser class provides the basic interface for writing custom SAX + # parsers. All callback methods defined in {Oga::XML::Parser} are delegated + # to a dedicated handler class. 
+ # + # To write a custom handler for the SAX parser, create a class that + # implements one (or many) of the following callback methods: + # + # * `on_document` + # * `on_doctype` + # * `on_cdata` + # * `on_comment` + # * `on_proc_ins` + # * `on_xml_decl` + # * `on_text` + # * `on_element` + # * `on_element_children` + # * `after_element` + # + # For example: + # + # class SaxHandler + # def on_element(namespace, name, attrs = {}) + # puts name + # end + # end + # + # You can then use it as follows: + # + # handler = SaxHandler.new + # parser = Oga::XML::SaxParser.new(handler, '') + # + # parser.parse + # + # For information on the callback arguments see the documentation of the + # corresponding methods in {Oga::XML::Parser}. + # + class SaxParser < Parser + ## + # @param [Object] handler The SAX handler to delegate callbacks to. + # @see [Oga::XML::Parser#initialize] + # + def initialize(handler, *args) + @handler = handler + + super(*args) + end + + # Delegate all callbacks to the handler object. 
+ instance_methods.grep(/^(on_|after_)/).each do |method| + eval <<-EOF, nil, __FILE__, __LINE__ + 1 + def #{method}(*args) + @handler.#{method}(*args) if @handler.respond_to?(:#{method}) + + return + end + EOF + end + end # SaxParser + end # XML +end # Oga diff --git a/spec/oga/html/sax_parser_spec.rb b/spec/oga/html/sax_parser_spec.rb new file mode 100644 index 0000000..c4c775c --- /dev/null +++ b/spec/oga/html/sax_parser_spec.rb @@ -0,0 +1,22 @@ +require 'spec_helper' + +describe Oga::HTML::SaxParser do + before do + @handler = Class.new do + attr_reader :name + + def on_element(namespace, name, attrs = {}) + @name = name + end + end + end + + example 'use custom callback methods if defined' do + handler = @handler.new + parser = described_class.new(handler, '') + + parser.parse + + handler.name.should == 'link' + end +end diff --git a/spec/oga/oga_spec.rb b/spec/oga/oga_spec.rb index 7f42554..569a788 100644 --- a/spec/oga/oga_spec.rb +++ b/spec/oga/oga_spec.rb @@ -1,19 +1,41 @@ require 'spec_helper' describe Oga do - context 'parse_xml' do - example 'parse an XML document' do - document = described_class.parse_xml('foo') + example 'parse an XML document' do + document = described_class.parse_xml('foo') - document.is_a?(Oga::XML::Document).should == true - end + document.is_a?(Oga::XML::Document).should == true end - context 'parse_html' do - example 'parse an HTML document' do - document = described_class.parse_xml('') + example 'parse an HTML document' do + document = described_class.parse_html('') - document.is_a?(Oga::XML::Document).should == true + document.is_a?(Oga::XML::Document).should == true + end + + context 'SAX parsing' do + before do + klass = Class.new do + attr_reader :name + + def on_element(namespace, name, attrs = {}) + @name = name + end + end + + @handler = klass.new + end + + example 'parse an XML document using the SAX parser' do + Oga.sax_parse_xml(@handler, '') + + @handler.name.should == 'foo' + end + + example 'parse an HTML document 
using the SAX parser' do + Oga.sax_parse_html(@handler, '') + + @handler.name.should == 'link' end end end diff --git a/spec/oga/xml/sax_parser_spec.rb b/spec/oga/xml/sax_parser_spec.rb new file mode 100644 index 0000000..03686d0 --- /dev/null +++ b/spec/oga/xml/sax_parser_spec.rb @@ -0,0 +1,35 @@ +require 'spec_helper' + +describe Oga::XML::SaxParser do + before do + @handler = Class.new do + attr_reader :name + + def on_element(namespace, name, attrs = {}) + @name = name + end + end + end + + example 'ignore return values of callback methods' do + parser = described_class.new(@handler.new, 'foo') + + parser.parse.should be_nil + end + + example 'use custom callback methods if defined' do + handler = @handler.new + parser = described_class.new(handler, '') + + parser.parse + + handler.name.should == 'foo' + end + + example 'ignore callbacks that are not defined in the handler' do + parser = described_class.new(@handler.new, '') + + # This would raise if undefined callbacks were _not_ ignored. + lambda { parser.parse }.should_not raise_error + end +end