[18425] | 1 | require 'rexml/entity'
|
---|
| 2 | require 'rexml/doctype'
|
---|
| 3 | require 'rexml/child'
|
---|
| 4 | require 'rexml/doctype'
|
---|
| 5 | require 'rexml/parseexception'
|
---|
| 6 |
|
---|
| 7 | module REXML
|
---|
| 8 | # Represents text nodes in an XML document
|
---|
| 9 | class Text < Child
|
---|
| 10 | include Comparable
|
---|
# Patterns for the characters that are escaped when text is written, in
# the order in which the substitutions occur. The first pattern only
# matches an ampersand that does not already start something shaped like
# an entity or character reference.
SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
# Replacement for the pattern at the same index in SPECIALS.
SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
# Characters which are substituted in written strings
SLAICEPS = [ '<', '>', '"', "'", '&' ]
# Entity references that are read back as the character at the same index
# in SLAICEPS. '&amp;' is deliberately last.
SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]
| 17 |
|
---|
# If +raw+ is true, then REXML leaves the value alone
attr_accessor :raw

# Matches what may not appear in a raw string: a literal '<', or an '&'
# that is followed neither by Entity::NAME nor by a numeric character
# reference. Group 1 is the offending character.
# (Entity::NAME is defined outside this file; presumably an XML name
# pattern -- confirm against rexml/entity.)
ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um
# Matches a numeric character reference, decimal (&#65;) or hexadecimal
# (&#x41;). Group 1 is the number after any leading zeros; for the
# hexadecimal form it still carries the leading 'x'.
NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
| 23 |
|
---|
# Constructor
#
# +arg+ if a String, the content is set to a copy of the String. If a
# Text, the object is shallowly cloned: its to_s string and its +raw+
# flag are taken over. Any other argument raises a RuntimeError.
#
# +respect_whitespace+ (boolean, false) if true, whitespace is
# respected; otherwise runs of the same whitespace character (space,
# newline or tab) in a String +arg+ are squeezed to a single one.
#
# +parent+ (nil) if this is a Parent object, the parent
# will be set to this.
#
# +raw+ (nil) This argument can be given three values.
# If true, then the value used to construct this object is expected to
# contain no unescaped XML markup, and REXML will not change the text. If
# this value is false, the string may contain any characters, and REXML will
# escape any and all defined entities whose values are contained in the
# text. If this value is nil (the default), then the raw value of the
# parent will be used as the raw value for this node. If there is no raw
# value for the parent, and no value is supplied, the default is false.
# Use this field if you have entities defined for some text, and you don't
# want REXML to escape that text in output.
#   Text.new( "<&", false, nil, false )         #-> "&lt;&amp;"
#   Text.new( "&lt;&amp;", false, nil, false )  #-> "&amp;lt;&amp;amp;"
#   Text.new( "<&", false, nil, true )          #-> raises (illegal character)
#   Text.new( "&lt;&amp;", false, nil, true )   #-> "&lt;&amp;"
#   # Assume that the entity "s" is defined to be "sean"
#   # and that the entity "r" is defined to be "russell"
#   Text.new( "sean russell" )                   #-> "&s; &r;"
#   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
#
# +entity_filter+ (nil) An array that is stored and handed to
# Text.normalize by to_s. It has no effect when +raw+ is true, because
# to_s then returns the string untouched.
# NOTE(review): Text.normalize skips the entities listed in the filter;
# earlier documentation described the filter as the set of entities to
# substitute -- confirm the intended semantics.
#
# +illegal+ INTERNAL USE ONLY. Pattern a raw string must not match.
def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
               entity_filter=nil, illegal=ILLEGAL )

  @raw = false

  if parent
    super( parent )
    @raw = parent.raw
  else
    @parent = nil
  end

  # An explicit +raw+ wins over the value inherited from the parent.
  @raw = raw unless raw.nil?
  @entity_filter = entity_filter
  @normalized = @unnormalized = nil

  if arg.kind_of? String
    # dup (not clone) so that a frozen argument yields a mutable copy.
    @string = arg.dup
    @string.squeeze!(" \n\t") unless respect_whitespace
  elsif arg.kind_of? Text
    @string = arg.to_s
    @raw = arg.raw
  else
    raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
  end

  # Normalise line endings to "\n".
  @string.gsub!( /\r\n?/, "\n" )

  # check for illegal characters
  if @raw
    if @string =~ illegal
      raise "Illegal character '#{$1}' in raw string \"#{@string}\""
    end
  end
end
| 95 |
|
---|
# Identifies the kind of node; always the symbol :text.
def node_type
  :text
end
| 99 |
|
---|
# Returns true when the stored string contains no characters.
def empty?
  @string.empty?
end
| 103 |
|
---|
| 104 |
|
---|
# Returns a new Text constructed from this one (Text.new( self )). No
# parent is passed, so the copy is not attached to anything.
def clone
  Text.new(self)
end
| 108 |
|
---|
| 109 |
|
---|
# Appends text to this text node. The text is appended as given: line
# endings are normalised to "\n", but nothing is escaped or validated
# here, so the text is interpreted in the +raw+ mode of this text node.
#
# +to_append+ the String to add
# +returns+ the stored string after the append
def <<( to_append )
  @string << to_append.gsub( /\r\n?/, "\n" )
  # The memoised to_s / value results describe the old content.
  @normalized = @unnormalized = nil
  @string
end
| 115 |
|
---|
| 116 |
|
---|
# Compares by string value.
#
# +other+ a String or a Text (anything responding to to_s)
# +returns+ the result of (to_s <=> other.to_s)
def <=>( other )
  to_s() <=> other.to_s
end
| 122 |
|
---|
# Entity::REFERENCE re-wrapped as a Regexp; Text::unnormalize scans
# strings with it to find the references they contain.
REFERENCE = /#{Entity::REFERENCE}/
# Returns the string value of this text node as it is written to XML.
#
# A raw node returns its stored string untouched. Otherwise the stored
# string is passed through Text::normalize, together with the doctype of
# the parent's document (when there is a parent and a document) and the
# entity filter given to the constructor. The result is memoised.
#
#   # Assume that the entity "s" is defined to be "sean", and that the
#   # entity "r" is defined to be "russell" in the document's doctype
#   t = Text.new( "sean russell", false, parent, false )
#   t.to_s   #-> "&s; &r;"
#   u = Text.new( "sean russell", false, nil, true )
#   u.to_s   #-> "sean russell"
def to_s
  return @string if @raw
  return @normalized if @normalized

  doctype = nil
  if @parent
    doc = @parent.document
    doctype = doc.doctype if doc
  end

  @normalized = Text::normalize( @string, doctype, @entity_filter )
end
| 149 |
|
---|
# Returns the inspect form of the stored string (quoted and escaped).
def inspect
  @string.inspect
end
| 153 |
|
---|
# Returns the string value of this text. This is the text without
# entities, as it might be used programmatically, or printed to the
# console. This ignores the 'raw' attribute setting, and any
# entity_filter.
#
# The stored string is passed through Text::unnormalize together with
# the doctype of the parent's document (when there is one); the result
# is memoised until the content changes.
#
#   # Assume that the entity "s" is defined to be "sean", and that the
#   # entity "r" is defined to be "russell"
#   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
#   t.value   #-> "< & sean russell"
#   t = Text.new( "&lt; &amp; &s; russell", false, nil, false )
#   t.value   #-> "< & sean russell"
#   u = Text.new( "sean russell", false, nil, true )
#   u.value   #-> "sean russell"
def value
  return @unnormalized if @unnormalized
  doctype = nil
  if @parent
    doc = @parent.document
    doctype = doc.doctype if doc
  end
  @unnormalized = Text::unnormalize( @string, doctype )
end
| 176 |
|
---|
# Sets the contents of this text node. This expects the text to be
# unnormalized. Line endings are normalised to "\n", the memoised to_s
# and value results are dropped, and +raw+ is switched off so that the
# new content is escaped on output.
#
#   e = Element.new( "a" )
#   e.add_text( "foo" )      # <a>foo</a>
#   e[0].value = "bar"       # <a>bar</a>
#   e[0].value = "<a>"       # <a>&lt;a&gt;</a>
def value=( val )
  @string = val.gsub( /\r\n?/, "\n" )
  @unnormalized = nil
  @normalized = nil
  @raw = false
end
| 190 |
|
---|
# Recursively wraps +string+ so that lines break at spaces at or before
# +width+ characters.
#
# +string+ the text to wrap
# +width+ the preferred maximum line length
# +addnewline+ (false) if true, the result starts with a newline; this
# only applies when the string is actually broken
# +returns+ the wrapped text. A word longer than +width+ is kept whole:
# the break moves to the first space after it, and a string without any
# usable space is returned unchanged.
def wrap(string, width, addnewline=false)
  return string if string.length <= width
  # Position of the last ' ' before the cutoff; when the first word is
  # too long, fall back to the first ' ' after the cutoff.
  place = string.rindex(' ', width) || string.index(' ', width)
  return string unless place
  wrapped = string[0,place] + "\n" + wrap(string[place+1..-1], width)
  addnewline ? "\n" + wrapped : wrapped
end
| 201 |
|
---|
# Prefixes every line of +string+ with +style+ repeated +level+ times.
#
# +string+ the text to indent
# +level+ (1) how many times +style+ is repeated; a negative level
# returns +string+ unchanged
# +style+ ("\t") the indentation unit
# +indentfirstline+ (true) if false, leading and trailing whitespace is
# stripped from the result, which removes the first line's indentation
#
# NOTE(review): trailing whitespace of every line is removed, and that
# includes the line terminator, so the lines end up joined without line
# breaks. This is the existing behaviour and is preserved here; confirm
# whether it is intended.
def indent_text(string, level=1, style="\t", indentfirstline=true)
  return string if level < 0
  indent_string = style * level
  new_string = ''.dup
  # String#each no longer exists on Ruby 1.9+; each_line works everywhere.
  string.each_line { |line|
    new_string << (indent_string + line).sub(/[\s]+$/,'')
  }
  new_string.strip! unless indentfirstline
  new_string
end
| 213 |
|
---|
# Writes the escaped text (to_s) to +writer+.
#
# Unless the parent asks for whitespace to be preserved
# (@parent.whitespace), the text is first word-wrapped at 60 columns when
# the parent's context has :wordwrap set to :all, then indented when the
# context defines :indentstyle, +indent+ is positive and the text spans
# several lines, and finally has repeated whitespace squeezed.
#
# +writer+ anything supporting <<( String )
# +indent+ (-1) indentation level passed to indent_text
# +transitive+ and +ie_hack+ are accepted but not used here.
# +returns+ the result of writer << text
def write( writer, indent=-1, transitive=false, ie_hack=false )
  s = to_s()
  unless @parent && @parent.whitespace
    s = wrap(s, 60, false) if @parent && @parent.context[:wordwrap] == :all
    if @parent && !@parent.context[:indentstyle].nil? && indent > 0 && s.count("\n") > 0
      s = indent_text(s, indent, @parent.context[:indentstyle], false)
    end
    # Non-destructive squeeze: +s+ can be the very string this node
    # stores (or memoises), and writing must not alter the node.
    s = s.squeeze(" \n\t") if @parent && !@parent.whitespace
  end
  writer << s
end
| 225 |
|
---|
# FIXME
# This probably won't work properly
#
# Returns the parent's xpath with "/text()" appended. There is no guard
# for a node without a parent; calling this on one raises NoMethodError.
def xpath
  @parent.xpath + "/text()"
end
| 233 |
|
---|
# Writes out text, substituting special characters beforehand.
# +out+ A String, IO, or any other object supporting <<( String )
# +input+ the text to substitute and then write out; it is not modified
#
# The snippet below was left here by the original author and is not
# executed; it appears to show how to turn UTF-8 text into ASCII with
# character references.
#
#   z=utf8.unpack("U*")
#   ascOut=""
#   z.each{|r|
#     if r < 0x100
#       ascOut.concat(r.chr)
#     else
#       ascOut.concat(sprintf("&#x%x;", r))
#     end
#   }
#   puts ascOut
def write_with_substitution(out, input)
  # dup (not clone) so that a frozen input yields a mutable copy.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  # (per the original author). The order matters: '&' must be escaped
  # before the other substitutions introduce new ampersands.
  copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
  copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
  copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
  copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
  copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
  copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
  out << copy
end
| 259 |
|
---|
# Reads text, substituting entities.
#
# +input+ the escaped text; it is not modified
# +illegal+ (nil) an optional pattern; when the input matches it a
# ParseException is raised
# +returns+ a copy of +input+ with line endings normalised to "\n", the
# references listed in SETUTITSBUS replaced by the characters in
# SLAICEPS, and numeric character references decoded
#
# NOTE(review): the substitutions run one after another, so text such as
# "&amp;#60;" is decoded twice (first to "&#60;", then to "<"). This is
# the existing behaviour and is preserved here.
def self.read_with_substitution( input, illegal=nil )
  copy = input.dup

  if illegal && copy =~ illegal
    raise ParseException.new( "malformed text: Illegal character #{$&} in \"#{copy}\"" )
  end

  copy.gsub!( /\r\n?/, "\n" )
  if copy.include?('&')
    copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
    copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
    copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
    copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
    copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
    # Hexadecimal digits may be upper or lower case.
    copy.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
      m = $1
      # "x41" -> "0x41" so that Integer() parses it as hexadecimal.
      m = "0#{m}" if m[0, 1] == 'x'
      [Integer(m)].pack('U*')
    }
  end
  copy
end
| 284 |
|
---|
# Matches an '&' that is not followed by Entity::NAME and a ';'.
# Within this file it is only mentioned in a commented-out line of
# Text::normalize.
EREFERENCE = /&(?!#{Entity::NAME};)/
# Escapes all possible entities.
#
# Every '&' in +input+ is first escaped to '&amp;'. Then each occurrence
# of an entity's value is replaced by a reference to that entity.
#
# +input+ the unescaped text; it is not modified
# +doctype+ (nil) when given, its entities are used; otherwise
# DocType::DEFAULT_ENTITIES are used
# +entity_filter+ (nil) entities to leave alone, given as entity names
# (entity objects are accepted as well); only consulted when a +doctype+
# is given
# +returns+ the escaped copy
def self.normalize( input, doctype=nil, entity_filter=nil )
  # Doing it like this rather than in a loop improves the speed
  #copy = copy.gsub( EREFERENCE, '&amp;' )
  copy = input.gsub( "&", "&amp;" )
  if doctype
    doctype.entities.each_value do |entity|
      next unless entity.value
      # The filter holds names, so compare by name; matching the entity
      # object itself is kept for callers that passed objects.
      next if entity_filter and
        ( entity_filter.include?(entity.name) or entity_filter.include?(entity) )
      copy = copy.gsub( entity.value, "&#{entity.name};" )
    end
  else
    DocType::DEFAULT_ENTITIES.each_value do |entity|
      copy = copy.gsub(entity.value, "&#{entity.name};" )
    end
  end
  copy
end
| 307 |
|
---|
# Unescapes all possible entities.
#
# +string+ the escaped text; it is not modified
# +doctype+ (nil) when given, entity values are looked up with
# doctype.entity( name ); otherwise DocType::DEFAULT_ENTITIES is used
# +filter+ (nil) names of entities whose references are left in place
# +illegal+ (nil) accepted but not used here
# +returns+ a copy with line endings normalised to "\n", numeric
# character references decoded, known named references replaced by their
# values and, when the text contained named references, '&amp;' turned
# back into '&' last
def self.unnormalize( string, doctype=nil, filter=nil, illegal=nil )
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE )
  return rv if matches.size == 0
  rv.gsub!( NUMERICENTITY ) {
    m = $1
    # "x41" -> "0x41" so that Integer() parses it as hexadecimal.
    m = "0#{m}" if m[0, 1] == 'x'
    [Integer(m)].pack('U*')
  }
  # The first capture of REFERENCE is the entity name; it is nil for
  # numeric references, which were handled above.
  names = matches.collect { |x| x[0] }.compact
  if names.size > 0
    names.each do |entity_reference|
      next if filter and filter.include?(entity_reference)
      if doctype
        entity_value = doctype.entity( entity_reference )
      else
        entity = DocType::DEFAULT_ENTITIES[ entity_reference ]
        entity_value = entity ? entity.value : nil
      end
      # A literal pattern and a block keep regexp metacharacters in the
      # name and backslashes in the value from being interpreted.
      rv.gsub!( "&#{entity_reference};" ) { entity_value } if entity_value
    end
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end
| 342 | end
|
---|
| 343 | end
|
---|