source: extensions/gsdl-video/trunk/installed/cmdline/lib/ruby/1.8/rexml/parsers/baseparser.rb@ 18425

Last change on this file since 18425 was 18425, checked in by davidb, 15 years ago

Video extension to Greenstone

File size: 17.0 KB
Line 
1require 'rexml/parseexception'
2require 'rexml/source'
3
4module REXML
5 module Parsers
6 # = Using the Pull Parser
7 # <em>This API is experimental, and subject to change.</em>
8 # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
9 # while parser.has_next?
10 # res = parser.next
11 # puts res[1]['att'] if res.start_tag? and res[0] == 'b'
12 # end
13 # See the PullEvent class for information on the content of the results.
14 # The data is identical to the arguments passed for the various events to
15 # the StreamListener API.
16 #
17 # Notice that:
18 # parser = PullParser.new( "<a>BAD DOCUMENT" )
19 # while parser.has_next?
20 # res = parser.next
21 # raise res[1] if res.error?
22 # end
23 #
24 # Nat Price gave me some good ideas for the API.
25 class BaseParser
# ---------------------------------------------------------------------
# Name building blocks. These are regular-expression *source strings*
# (not Regexp objects) so that they can be interpolated into the larger
# patterns defined below.
# ---------------------------------------------------------------------
NCNAME_STR= '[\w:][\-\w\d.]*'
NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

NAMECHAR = '[\-\w\d\.:]'
NAME = "([\\w:]#{NAMECHAR}*)"
NMTOKEN = "(?:#{NAMECHAR})+"
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
# Entity reference (&name;), decimal (&#NN;) or hexadecimal (&#xHH;)
# character reference. Only the named form fills the capture group.
REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/

# ---------------------------------------------------------------------
# Markup recognisers. The *_START patterns are used to classify what is
# at the head of the input; the *_PATTERN / *_MATCH patterns capture the
# payload of the construct.
# ---------------------------------------------------------------------
DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um
COMMENT_START = /\A<!--/u
COMMENT_PATTERN = /<!--(.*?)-->/um
CDATA_START = /\A<!\[CDATA\[/u
# Despite its name, this matches "]>", i.e. the end of an internal DTD
# subset; #pull uses it to emit :end_doctype.
CDATA_END = /^\s*\]\s*>/um
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START = /\A<\?xml\s/u
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /\A<\?/u
INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
# Captures: 1 = tag name, 2 = raw attribute text, 3 = quote character of
# the last attribute, 4 = "/" when the tag is self-closing.
TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{NAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/um
CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um

# Pseudo-attributes of the XML declaration. Whitespace is optional on
# both sides of "=", so all three patterns use \s* there. (STANDALONE
# previously demanded exactly one whitespace character after "=", which
# made it miss the common form standalone="yes".)
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

# ---------------------------------------------------------------------
# DTD (internal subset) declarations.
# ---------------------------------------------------------------------
ENTITY_START = /^\s*<!ENTITY/
# Applied to the text captured by DOCTYPE_PATTERN. Captures: 1 = name,
# 2..4 = the optional tokens following it (used by #pull as
# pub_sys, long_name and uri).
IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'].*?['"])?(\s+['"].*?["'])?/u
ELEMENTDECL_START = /^\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE = /#{ATTDEF}/
ATTLISTDECL_START = /^\s*<!ATTLIST/um
ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NOTATIONDECL_START = /^\s*<!NOTATION/um
PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um

# Character data: everything up to (not including) the next "<".
TEXT_PATTERN = /\A([^<]*)/um

# Entity constants
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
NDATADECL = "\\s+NDATA\\s+#{NAME}"
PEREFERENCE = "%#{NAME};"
# NOTE: the single-quoted alternative uses a capturing group where the
# double-quoted one does not; #pull compensates for the extra capture
# when it post-processes an ENTITYDECL match, so the grouping must not
# be changed here in isolation.
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um

# A bare "&" that does not start a named entity reference.
EREFERENCE = /&(?!#{NAME};)/

# The predefined entities handled here. Each value is
#   [ pattern matching the escaped form, escaped text,
#     raw character, pattern matching the raw character ]
# ("amp" is not in this table; #normalize and #unnormalize handle
# ampersands separately.)
DEFAULT_ENTITIES = {
  'gt' => [/&gt;/, '&gt;', '>', />/],
  'lt' => [/&lt;/, '&lt;', '<', /</],
  'quot' => [/&quot;/, '&quot;', '"', /"/],
  "apos" => [/&apos;/, "&apos;", "'", /'/]
}


######################################################################
# These are patterns to identify common markup errors, to make the
# error messages more informative.
######################################################################
MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
105
# Builds a parser over +source+. All set-up is delegated to #stream=,
# so constructing a parser and re-pointing an existing one at a new
# input go through the same code path.
def initialize(source)
  self.stream = source
end
109
# Registers +listener+; after every #pull the event is also passed to
# each registered listener's +receive+ method.
#
# The first registration wraps this object's #pull: the current
# implementation is aliased to +_old_pull+ and a replacement that
# forwards the event to the listeners is evaluated on this instance.
# Later registrations only append to the listener list.
def add_listener( listener )
  already_hooked = defined?(@listeners) && @listeners
  unless already_hooked
    @listeners = []
    instance_eval <<-EOL
      alias :_old_pull :pull
      def pull
        event = _old_pull
        @listeners.each do |listener|
          listener.receive event
        end
        event
      end
    EOL
  end
  @listeners << listener
end
126
127 attr_reader :source
128
# Points the parser at a new input and resets all parsing state.
# +source+ is handed to SourceFactory.create_from; the pending
# self-closing tag, the document status, the open-tag list, the
# look-ahead stack and the entity list are all cleared.
def stream=( source )
  @source = SourceFactory.create_from( source )
  @closed = @document_status = nil
  @tags, @stack = [], []
  @entities = []
end
137
# Returns the source's position when the source exposes one, and 0
# otherwise.
def position
  # FIXME: sources without a #position are reported as being at 0.
  return 0 unless @source.respond_to?( :position )
  @source.position
end
146
# Returns true if there are no more events: the source is exhausted
# and nothing is waiting on the look-ahead stack.
def empty?
  @source.empty? && @stack.empty?
end
151
# Returns true if there are more events, i.e. the source still has
# input or the look-ahead stack still holds events. Synonymous with
# !empty?
def has_next?
  !@source.empty? || !@stack.empty?
end
156
# Pushes +token+ back onto the head of the event stream, so that it is
# the next event #pull returns. Any number of events may be pushed back.
def unshift token
  @stack.insert( 0, token )
end
162
# Looks ahead at the event +depth+ positions from the head of the
# stream without consuming it; depth 0 is the next event. A +depth+ of
# -1 parses to the end of the input and returns the last event.
#
# Peeking forces the input to be parsed up to the requested event and
# keeps every event read on the way in memory, so peek(-1) effectively
# pre-parses the whole document.
#
# Raises a RuntimeError if +depth+ is less than -1.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1
  lookahead = []
  if depth == -1
    lookahead << pull until empty?
  else
    wanted = depth + 1
    lookahead << pull while @stack.size + lookahead.size < wanted
  end
  @stack += lookahead unless lookahead.empty?
  @stack[depth]
end
182
# Returns the next event as an Array whose first element is a Symbol
# naming the event and whose remaining elements carry its data:
#
#   [ :start_element, name, attributes_hash ]
#   [ :end_element, name ]
#   [ :text, raw_text ]
#   [ :cdata, text ]
#   [ :comment, text ]
#   [ :processing_instruction, target, content ]
#   [ :xmldecl, version, encoding, standalone ]
#   [ :start_doctype, name, pub_sys, long_name, uri ]
#   [ :end_doctype ]
#   [ :externalentity, reference ]
#   [ :elementdecl, declaration ]
#   [ :entitydecl, name, ... ]
#   [ :attlistdecl, element, defaults_hash, raw_declaration ]
#   [ :notationdecl, name, "PUBLIC"|"SYSTEM", public_id, system_id ]
#   [ :end_document ]
#
# Events queued by #unshift or #peek are returned before any new input
# is parsed.
#
# Raises REXML::ParseException on malformed input. Any other exception
# raised while parsing document content is wrapped in a
# REXML::ParseException as well.
def pull
  # A self-closing tag was reported as :start_element by the previous
  # call; report the matching :end_element now.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  return @stack.shift if @stack.size > 0
  @source.read if @source.buffer.size<2

  # --- Prolog: nothing but whitespace/markup seen so far -------------
  if @document_status == nil
    # Look (without consuming) at the leading whitespace run or the
    # leading <...> construct to decide what comes next.
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    case word
    when COMMENT_START
      return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
    when XMLDECL_START
      results = @source.match( XMLDECL_PATTERN, true )[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      # The declared encoding (possibly nil) is handed to the source.
      @source.encoding = encoding
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      identity = md[1]
      close = md[2]
      identity =~ IDENTITY
      name = $1
      # ParseException is a class, so it has to be instantiated with
      # .new; calling it like a method raised NoMethodError instead of
      # the intended parse error.
      raise REXML::ParseException.new( "DOCTYPE is missing a name", @source ) if name.nil?
      pub_sys = $2.nil? ? nil : $2.strip
      long_name = $3.nil? ? nil : $3.strip
      uri = $4.nil? ? nil : $4.strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        # No internal subset: queue the :end_doctype right away.
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        @source.match(/^\s*/um, true)
        @stack << [ :end_doctype ]
      else
        # "[" opens an internal subset; its declarations are handled
        # by the :in_doctype section below on subsequent calls.
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace: left in the buffer, so it is reported as
      # :text by the content section below.
    else
      # Anything else means the prolog is over; skip whitespace and
      # fall through to content parsing.
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      @source.match(/\s*/um, true)
    end
  end

  # --- Internal DTD subset -------------------------------------------
  if @document_status == :in_doctype
    md = @source.match(/\s*(.*?>)/um)
    # Without this guard an unterminated subset crashed with a
    # NoMethodError on nil (this section is outside the rescue below).
    raise REXML::ParseException.new( "Malformed DOCTYPE: unterminated internal subset", @source ) if md.nil?
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      # Drop the unmatched capture groups, then reuse slot 0 (the full
      # match) for the event name.
      match = @source.match( ENTITYDECL, true ).to_a.compact
      match[0] = :entitydecl
      ref = false
      if match[1] == '%'
        # Parameter entity: remember that and drop the "%" marker; it
        # is re-appended at the end of the event.
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # system literal without its quotes
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entitydecl, name, SYSTEM, literal(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        # match is [ :entitydecl, name, PUBLIC, pubid, href ]
      else
        match[2] = match[2][1..-2]
        # A single-quoted value leaves one extra capture behind (see
        # ENTITYVALUE); discard it.
        match.pop if match.size == 4
        # match is [ :entitydecl, name, value ]
      end
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      # Collect attribute name => declared default for this element.
      # NOTE(review): the "#IMPLIED" test runs on the un-compacted
      # capture array while the reads below use compacted indexes;
      # kept exactly as it was -- confirm the intended indexes before
      # changing.
      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      md = nil
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when CDATA_END
      # "]>" closes the internal subset.
      @document_status = :after_doctype
      @source.match( CDATA_END, true )
      return [ :end_doctype ]
    end
  end

  # --- Document content ----------------------------------------------
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        # Closing tag: it must match the most recently opened element.
        last_tag = @tags.pop
        md = @source.match( CLOSE_MATCH, true )
        raise REXML::ParseException.new( "Missing end tag for "+
          "'#{last_tag}' (got \"#{md[1]}\")",
          @source) unless last_tag == md[1]
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        # "<!" in content is either a comment or a CDATA section.
        md = @source.match(/\A(\s*[^>]*>)/um)
        raise REXML::ParseException.new("Malformed node", @source) unless md
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        md = @source.match( INSTRUCTION_PATTERN, true )
        return [ :processing_instruction, md[1], md[2] ] if md
        raise REXML::ParseException.new( "Bad instruction declaration",
          @source)
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          # Check for missing attribute quotes
          raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        attrs = []
        if md[2].size > 0
          attrs = md[2].scan( ATTRIBUTE_PATTERN )
          # $' is whatever followed the last attribute that scan
          # matched; anything but whitespace there is an error.
          raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
        end

        if md[4]
          # Self-closing: the :end_element is emitted by the next call.
          @closed = md[1]
        else
          @tags.push( md[1] )
        end
        attributes = {}
        attrs.each { |a,b,c| attributes[a] = c }
        return [ :start_element, md[1], attributes ]
      end
    else
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        @source.match( /(\s+)/, true )
      end
      # The text is returned as it appears in the input; entity
      # references are not expanded here (see #unnormalize).
      return [ :text, md[1] ]
    end
  rescue REXML::ParseException
    raise
  rescue Exception, NameError => error
    # Everything else is reported as a ParseException carrying the
    # source, this parser and the original exception.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, (error ? error : $!) )
  end
  return [ :dummy ]
end
382
# Resolves the entity named +reference+ to its replacement text.
#
# +entities+ (a name => value mapping, or nil) is consulted first; if
# it has no usable value, the raw character from DEFAULT_ENTITIES is
# used. The value found is passed through #unnormalize so that
# references inside it are expanded as well. Returns nil when the
# entity is unknown.
def entity( reference, entities )
  replacement = entities && entities[ reference ]
  unless replacement
    builtin = DEFAULT_ENTITIES[ reference ]
    replacement = builtin && builtin[2]
  end
  replacement ? unnormalize( replacement, entities ) : nil
end
392
# Escapes all possible entities in +input+ and returns the escaped copy;
# +input+ itself is not modified.
#
# input::         the text to escape
# entities::      optional mapping of entity name => replacement text;
#                 every occurrence of a replacement text is turned back
#                 into the reference <tt>&name;</tt>
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted from +entities+
#
# Bare ampersands (those that do not start a named reference) become
# <tt>&amp;</tt>, and the characters listed in DEFAULT_ENTITIES are
# replaced by their escaped forms.
def normalize( input, entities=nil, entity_filter=nil )
  copy = input.clone
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  entities.each do |key, value|
    # The filter holds entity names, so it is checked against the key.
    # (This used to call the #entity method without arguments, which
    # raised as soon as a filter was supplied.)
    next if entity_filter and entity_filter.include?( key )
    copy.gsub!( value, "&#{key};" )
  end if entities
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each do |key, value|
    # value[3] matches the raw character, value[1] is its escaped form.
    copy.gsub!( value[3], value[1] )
  end
  copy
end
408
# Unescapes all possible entities in +string+ and returns the result;
# +string+ itself is not modified.
#
# string::   the text to unescape
# entities:: optional mapping of entity name => replacement text,
#            consulted (via #entity) before the built-in entities
# filter::   optional collection of entity names that must be left
#            unexpanded
#
# Line ends are normalised to "\n", numeric character references are
# replaced by the UTF-8 encoding of their code point, named references
# are replaced by their values, and finally <tt>&amp;</tt> becomes "&".
def unnormalize( string, entities=nil, filter=nil )
  rv = string.clone
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  # Numeric references. Leading zeros are skipped by the pattern so a
  # decimal value is never read as octal; "xHH" is turned into "0xHH"
  # for Integer().
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {|m|
    m=$1
    m = "0#{m}" if m[0] == ?x
    [Integer(m)].pack('U*')
  }
  # Only named references fill the capture group; numeric ones give nil.
  names = matches.collect { |captures| captures[0] }.compact
  if names.size > 0
    names.each do |entity_reference|
      next if filter and filter.include?( entity_reference )
      entity_value = entity( entity_reference, entities )
      # Literal pattern + block replacement: the name is matched
      # exactly (a "." in it is not a wildcard) and the value is
      # inserted verbatim (a backslash in it is not a back-reference).
      rv.gsub!( "&#{entity_reference};" ) { entity_value } if entity_value
    end
    names.each do |entity_reference|
      next if filter and filter.include?( entity_reference )
      er = DEFAULT_ENTITIES[entity_reference]
      rv.gsub!( er[0], er[2] ) if er
    end
    # Ampersands go last so that text such as "&amp;lt;" ends up as
    # "&lt;" rather than "<".
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end
441 end
442 end
443end
444
445=begin
446 case event[0]
447 when :start_element
448 when :text
449 when :end_element
450 when :processing_instruction
451 when :cdata
452 when :comment
453 when :xmldecl
454 when :start_doctype
455 when :end_doctype
456 when :externalentity
457 when :elementdecl
458 when :entitydecl
459 when :attlistdecl
460 when :notationdecl
461 when :end_doctype
462 end
463=end
Note: See TracBrowser for help on using the repository browser.