root/extensions/gsdl-video/trunk/installed/cmdline/lib/ruby/1.8/rss/parser.rb @ 18425

Revision 18425, 12.0 KB (checked in by davidb, 11 years ago)

Video extension to Greenstone

Line 
1require "forwardable"
2require "open-uri"
3
4require "rss/rss"
5
6module RSS
7
# Raised when the input being parsed is not well formed XML.
class NotWellFormedError < Error
  # line::    the approximate line number given to the constructor, or nil
  # element:: the element name given to the constructor, or nil
  attr_reader :line, :element

  # Create a new NotWellFormedError for an error at +line+
  # in +element+.  If a block is given, its return value is
  # appended to the error message on a line of its own.
  def initialize(line=nil, element=nil)
    # Remember the location so the readers declared above report it.
    @line = line
    @element = element

    # Build the message with + so no string literal is modified in place.
    message = "This is not well formed XML"
    if element || line
      message += "\nerror occurred"
      message += " in #{element}" if element
      message += " at about #{line} line" if line
    end
    message += "\n#{yield}" if block_given?
    super(message)
  end
end
25
# Raised when none of the libraries listed in
# AVAILABLE_PARSER_LIBRARIES is available.
class XMLParserNotFound < Error
  # Takes no arguments; the message lists the libraries that were tried.
  def initialize
    # Adjacent literals are joined by the parser, so nothing is
    # appended to a string literal at run time.
    super("available XML parser was not found in " \
          "#{AVAILABLE_PARSER_LIBRARIES.inspect}.")
  end
end
32
# Raised when a parser that is not in AVAILABLE_PARSERS is requested.
class NotValidXMLParser < Error
  # parser:: the rejected value; it is interpolated into the message
  #          together with the list of available parsers.
  def initialize(parser)
    # Pick the singular or plural wording for the list that follows.
    wording = AVAILABLE_PARSERS.size > 1 ? "s are " : " is "
    super("#{parser} is not an available XML parser. " \
          "Available XML parser#{wording}" \
          "#{AVAILABLE_PARSERS.inspect}.")
  end
end
41
# Raised when the namespace URI bound to a tag's prefix is not the
# URI required for that tag.
class NSError < InvalidRSSError
  # tag::    name of the offending tag
  # prefix:: namespace prefix used on the tag
  # uri::    the namespace URI that was required
  attr_reader :tag, :prefix, :uri

  def initialize(tag, prefix, require_uri)
    @tag, @prefix, @uri = tag, prefix, require_uri
    # Adjacent literals are joined by the parser, so nothing is
    # appended to a string literal at run time.
    super("prefix <#{prefix}> doesn't associate uri " \
          "<#{require_uri}> in tag <#{tag}>")
  end
end
50
# Front end of the parsers: obtains the XML text for a source and
# delegates the actual work to an instance of a concrete parser class.
class Parser

  extend Forwardable

  class << self

    @@default_parser = nil

    # Return the parser class used when none is given explicitly:
    # the one chosen through default_parser=, otherwise the first
    # entry of AVAILABLE_PARSERS.
    def default_parser
      @@default_parser || AVAILABLE_PARSERS.first
    end

    # Set @@default_parser to new_value if it is one of the
    # available parsers. Else raise NotValidXMLParser error.
    def default_parser=(new_value)
      unless AVAILABLE_PARSERS.include?(new_value)
        raise NotValidXMLParser.new(new_value)
      end
      @@default_parser = new_value
    end

    # Build a parser for +rss+, apply the two options and return
    # the result of parsing.
    #
    # rss::                    XML text, a URI, or a file name
    #                          (see normalize_rss)
    # do_validate::            forwarded to the parser's do_validate=
    # ignore_unknown_element:: forwarded to the parser's
    #                          ignore_unknown_element=
    # parser_class::           class used to do the parsing
    def parse(rss, do_validate=true, ignore_unknown_element=true,
              parser_class=default_parser)
      parser = new(rss, parser_class)
      parser.do_validate = do_validate
      parser.ignore_unknown_element = ignore_unknown_element
      parser.parse
    end
  end

  def_delegators(:@parser, :parse, :rss,
                 :ignore_unknown_element,
                 :ignore_unknown_element=, :do_validate,
                 :do_validate=)

  # Create the underlying +parser_class+ instance, handing it the
  # XML obtained from +rss+.
  def initialize(rss, parser_class=self.class.default_parser)
    @parser = parser_class.new(normalize_rss(rss))
  end

  private

  # Try to get the XML associated with +rss+.
  # Return +rss+ if it already looks like XML; otherwise read it when
  # it (or the URI made from it) responds to +read+, or read the file
  # it names. If none of these apply, +rss+ is returned unchanged.
  def normalize_rss(rss)
    return rss if maybe_xml?(rss)

    uri = to_uri(rss)

    if uri.respond_to?(:read)
      uri.read
    elsif trusted?(rss) and File.readable?(rss)
      File.open(rss) {|f| f.read}
    else
      rss
    end
  end

  # True unless +rss+ reports itself as tainted. Object#tainted? was
  # removed in Ruby 3.2, so objects without the method count as trusted
  # instead of raising NoMethodError.
  def trusted?(rss)
    !(rss.respond_to?(:tainted?) and rss.tainted?)
  end

  # maybe_xml? tests if source is a string that looks like XML,
  # i.e. a String containing "<".
  def maybe_xml?(source)
    source.is_a?(String) and /</ =~ source
  end

  # Attempt to convert rss to a URI, but just return it if that
  # is not possible. URI() reports an unparsable string with a
  # ::URI::Error; current Ruby versions raise ArgumentError for
  # sources that are not strings (IO objects, for instance).
  def to_uri(rss)
    return rss if rss.is_a?(::URI::Generic)

    begin
      URI(rss)
    rescue ::URI::Error, ArgumentError
      rss
    end
  end
end
127
# Behaviour shared by the concrete parsers. This class calls
# +self.class.listener+ and +_parse+ but defines neither of them;
# presumably each concrete subclass provides both.
class BaseParser

  class << self
    # Forward the question to the listener class.
    def raise_for_undefined_entity?
      listener.raise_for_undefined_entity?
    end
  end

  # rss:: the source to parse; kept in @rss. A new listener is
  #       created from the class returned by +self.class.listener+.
  def initialize(rss)
    @listener = self.class.listener.new
    @rss = rss
  end

  # The listener's rss value.
  def rss
    @listener.rss
  end

  # Read the listener's ignore_unknown_element setting.
  def ignore_unknown_element
    @listener.ignore_unknown_element
  end

  # Change the listener's ignore_unknown_element setting.
  def ignore_unknown_element=(new_value)
    @listener.ignore_unknown_element = new_value
  end

  # Read the listener's do_validate setting.
  def do_validate
    @listener.do_validate
  end

  # Change the listener's do_validate setting.
  def do_validate=(new_value)
    @listener.do_validate = new_value
  end

  # Run +_parse+ unless the listener already holds a result, then
  # return the listener's rss value.
  def parse
    _parse if @listener.rss.nil?
    @listener.rss
  end

end
169
# Class-level registry used by the listeners: for each namespace URI
# and tag name it records a setter name and a class name, and it
# generates the start_<tag> handlers for plain text elements.
class BaseListener

  extend Utils

  class << self

    @@setters = {}
    @@registered_uris = {}
    @@class_names = {}

    # return the setter for the uri, tag_name pair, or nil.
    def setter(uri, tag_name)
      (@@setters[uri] || {})[tag_name]
    end

    # return the tag_names for setters associated with uri
    # (an empty array when nothing is installed for uri)
    def available_tags(uri)
      (@@setters[uri] || {}).keys
    end

    # register uri against this name.
    def register_uri(uri, name)
      @@registered_uris[name] ||= {}
      @@registered_uris[name][uri] = nil
    end

    # test if this uri is registered against this name.
    # A name nothing was registered for yields false.
    def uri_registered?(uri, name)
      (@@registered_uris[name] || {}).has_key?(uri)
    end

    # record class_name for the supplied uri and tag_name
    def install_class_name(uri, tag_name, class_name)
      @@class_names[uri] ||= {}
      @@class_names[uri][tag_name] = class_name
    end

    # retrieve class_name for the supplied uri and tag_name
    # If it doesn't exist, capitalize the tag_name. The fallback
    # also applies when other tags are recorded for uri but this
    # one is not.
    def class_name(uri, tag_name)
      recorded = (@@class_names[uri] || {})[tag_name]
      return recorded if recorded

      tag_name[0, 1].upcase + tag_name[1..-1]
    end

    # Install +setter+ for the uri, name pair and make sure a
    # start_<name> handler exists for it. The file and line passed
    # on come from get_file_and_line_from_caller (not defined in
    # this class; presumably supplied by Utils).
    def install_get_text_element(uri, name, setter)
      install_setter(uri, name, setter)
      def_get_text_element(uri, name, *get_file_and_line_from_caller(1))
    end

    # Always true here.
    def raise_for_undefined_entity?
      true
    end

    private
    # set the setter for the uri, tag_name pair
    def install_setter(uri, tag_name, setter)
      @@setters[uri] ||= {}
      @@setters[uri][tag_name] = setter
    end

    # Register uri for name and, unless this class already has its
    # own private start_<name> method, define one. The generated
    # method collects the element text when the tag's namespace is
    # registered for name and otherwise falls back to
    # start_else_element.
    def def_get_text_element(uri, name, file, line)
      register_uri(uri, name)
      handler = "start_#{name}"
      # Method names are Strings or Symbols depending on the Ruby
      # version, so compare their string forms.
      defined_here = private_instance_methods(false).any? do |meth|
        meth.to_s == handler
      end
      unless defined_here
        module_eval(<<-EOT, file, line)
        def start_#{name}(name, prefix, attrs, ns)
          uri = _ns(ns, prefix)
          if self.class.uri_registered?(uri, #{name.inspect})
            start_get_text_element(name, prefix, ns, uri)
          else
            start_else_element(name, prefix, attrs, ns)
          end
        end
        EOT
        __send__("private", handler)
      end
    end

  end

end
262
# Event handlers shared by the listeners. The including class receives
# xmldecl / instruction / tag_start / text / tag_end calls; this module
# keeps namespace, tag, text and callback stacks and hooks each new
# element onto @last_element.
module ListenerMixin

  attr_reader :rss

  attr_accessor :ignore_unknown_element
  attr_accessor :do_validate

  def initialize
    @rss = nil
    @ignore_unknown_element = true
    @do_validate = true
    @ns_stack = [{}]
    @tag_stack = [[]]
    # Text buffers are appended to in place, so start from a copy
    # rather than from the literal itself.
    @text_stack = [''.dup]
    @proc_stack = []
    @last_element = nil
    @version = @encoding = @standalone = nil
    @xml_stylesheets = []
  end

  # set instance vars for version, encoding, standalone
  def xmldecl(version, encoding, standalone)
    @version, @encoding, @standalone = version, encoding, standalone
  end

  # Handle a processing instruction. Only "xml-stylesheet" with an
  # href pseudo-attribute is recorded; everything else is ignored.
  def instruction(name, content)
    return unless name == "xml-stylesheet"

    params = parse_pi_content(content)
    if params.has_key?("href")
      @xml_stylesheets << XMLStyleSheet.new(*params)
    end
  end

  # Handle an opening tag.
  #
  # name::       tag name, optionally with a namespace prefix
  # attributes:: name/value pairs; xmlns and xmlns:* entries extend
  #              the namespace scope, the rest become the element's
  #              attributes
  #
  # Dispatches to start_<local name> when this object responds to
  # it (private methods included), else to start_else_element.
  def tag_start(name, attributes)
    @text_stack.push(''.dup)

    ns = @ns_stack.last.dup
    attrs = {}
    attributes.each do |n, v|
      declaration = /\Axmlns(?:\z|:)/.match(n)
      if declaration
        # What follows "xmlns:" is the prefix being declared
        # ("" for the default namespace).
        ns[declaration.post_match] = v
      else
        attrs[n] = v
      end
    end
    @ns_stack.push(ns)

    prefix, local = split_name(name)
    @tag_stack.last.push([_ns(ns, prefix), local])
    @tag_stack.push([])
    if respond_to?("start_#{local}", true)
      __send__("start_#{local}", local, prefix, attrs, ns.dup)
    else
      start_else_element(local, prefix, attrs, ns.dup)
    end
  end

  # Handle a closing tag: pop the state pushed by tag_start and run
  # the callback registered for the element, if there is one, with
  # the collected text and the child tags.
  def tag_end(name)
    if DEBUG
      p "end tag #{name}"
      p @tag_stack
    end
    text = @text_stack.pop
    tags = @tag_stack.pop
    pr = @proc_stack.pop
    pr.call(text, tags) unless pr.nil?
    @ns_stack.pop
  end

  # Append character data to the text buffer of the current element.
  def text(data)
    @text_stack.last << data
  end

  private
  # Namespace URI bound to prefix in ns, or "" when there is none.
  def _ns(ns, prefix)
    ns.fetch(prefix, "")
  end

  # NOTE(review): inside a character class \2 is not a backreference,
  # so [^\2] does not exclude the opening quote; the value still ends
  # at the matching quote because of the lazy +? followed by \2.
  CONTENT_PATTERN = /\s*([^=]+)=(["'])([^\2]+?)\2/
  # Extract every name="value" pair from content; values may be
  # quoted with single or double quotes (see CONTENT_PATTERN).
  # Return a Hash mapping each name to its value.
  def parse_pi_content(content)
    params = {}
    content.scan(CONTENT_PATTERN) do |name, quote, value|
      params[name] = value
    end
    params
  end

  # Handle a tag that has no start_<tag> method of its own. If the
  # class of the current element has a constant named after the tag
  # (see the listener class's class_name), start an element of that
  # class. Otherwise the tag is skipped, or NotExpectedTagError is
  # raised when validating without ignoring unknown elements.
  def start_else_element(local, prefix, attrs, ns)
    class_name = self.class.class_name(_ns(ns, prefix), local)
    current_class = @last_element.class
    # Constant names are Strings or Symbols depending on the Ruby
    # version, so compare their string forms.
    known = current_class.constants.any? {|const| const.to_s == class_name}
    if known
      next_class = current_class.const_get(class_name)
      start_have_something_element(local, prefix, attrs, ns, next_class)
    elsif !@do_validate or @ignore_unknown_element
      @proc_stack.push(nil)
    else
      parent = "ROOT ELEMENT???"
      # There is no current element yet at the root; its class
      # (NilClass) has no tag_name to ask for.
      if current_class.respond_to?(:tag_name) and current_class.tag_name
        parent = current_class.tag_name
      end
      raise NotExpectedTagError.new(local, _ns(ns, prefix), parent)
    end
  end

  NAMESPLIT = /^(?:([\w:][-\w\d.]*):)?([\w:][-\w\d.]*)/
  # Split a tag name into [prefix, local name]; the prefix is ''
  # when the name has none.
  def split_name(name)
    name =~ NAMESPLIT
    [$1 || '', $2]
  end

  # When validating, raise NSError unless prefix is bound to
  # require_uri in ns.
  def check_ns(tag_name, prefix, ns, require_uri)
    return unless @do_validate

    unless _ns(ns, prefix) == require_uri
      raise NSError.new(tag_name, prefix, require_uri)
    end
  end

  # Register the callback for a text-only element: at the closing tag
  # the collected text is passed to the setter installed for
  # required_uri / tag_name, provided the current element responds to
  # it. Otherwise NotExpectedTagError is raised when validating
  # without ignoring unknown elements.
  def start_get_text_element(tag_name, prefix, ns, required_uri)
    @proc_stack.push Proc.new {|text, tags|
      setter = self.class.setter(required_uri, tag_name)
      if @last_element.respond_to?(setter)
        @last_element.__send__(setter, text.to_s)
      else
        if @do_validate and !@ignore_unknown_element
          raise NotExpectedTagError.new(tag_name, _ns(ns, prefix),
                                        @last_element.tag_name)
        end
      end
    }
  end

  # Start an element that has a class of its own (+klass+): check its
  # namespace, collect the attributes klass declares, create the
  # element, attach it to the current element and make it current
  # until its closing tag.
  #
  # Raises MissingAttributeError when validating and a required
  # attribute is absent.
  def start_have_something_element(tag_name, prefix, attrs, ns, klass)

    check_ns(tag_name, prefix, ns, klass.required_uri)

    attributes = {}
    klass.get_attributes.each do |a_name, a_uri, required, element_name|

      if a_uri.is_a?(String) or !a_uri.respond_to?(:include?)
        a_uri = [a_uri]
      end

      # Look for the attribute under every prefix bound to one of
      # its namespaces; the first value found is used.
      val = nil
      unless a_uri == [""]
        ns.each do |ns_prefix, ns_uri|
          if a_uri.include?(ns_uri)
            val = attrs["#{ns_prefix}:#{a_name}"]
            break if val
          end
        end
      end
      # An unprefixed attribute is acceptable when "" is among
      # the allowed namespaces.
      if val.nil? and a_uri.include?("")
        val = attrs[a_name]
      end

      if @do_validate and required and val.nil?
        unless a_uri.include?("")
          # Report the name with the first prefix bound to one of
          # the allowed namespaces (only one prefix is added).
          bound = ns.find {|ns_prefix, ns_uri| a_uri.include?(ns_uri)}
          a_name = "#{bound[0]}:#{a_name}" if bound
        end
        raise MissingAttributeError.new(tag_name, a_name)
      end

      attributes[a_name] = val
    end

    previous = @last_element
    next_element = klass.new(@do_validate, attributes)
    # instance_eval so that set_next_element is reached even when
    # it is not public.
    previous.instance_eval {set_next_element(tag_name, next_element)}
    @last_element = next_element
    # At the closing tag: store the text, validate, and make the
    # parent element current again.
    @proc_stack.push Proc.new { |text, tags|
      p(@last_element.class) if DEBUG
      @last_element.content = text if klass.have_content?
      if @do_validate
        @last_element.validate_for_stream(tags, @ignore_unknown_element)
      end
      @last_element = previous
    }
  end

end
454
# Candidate parser libraries as [library to require, parser class name]
# pairs. The list is only defined here when the constant does not
# exist yet.
unless const_defined? :AVAILABLE_PARSER_LIBRARIES
  AVAILABLE_PARSER_LIBRARIES = [
    ["rss/xmlparser", :XMLParserParser],
    ["rss/xmlscanner", :XMLScanParser],
    ["rss/rexmlparser", :REXMLParser],
  ]
end

# Parser classes whose library could be loaded, in the order of
# AVAILABLE_PARSER_LIBRARIES.
AVAILABLE_PARSERS = []

AVAILABLE_PARSER_LIBRARIES.each do |lib, parser|
  begin
    require lib
    AVAILABLE_PARSERS.push(const_get(parser))
  rescue LoadError
    # This library cannot be loaded; try the next candidate.
  end
end

# Without a single usable parser nothing can be parsed.
raise XMLParserNotFound if AVAILABLE_PARSERS.empty?
476end
Note: See TracBrowser for help on using the browser.