1 | """Beautiful Soup
|
---|
2 | Elixir and Tonic
|
---|
3 | "The Screen-Scraper's Friend"
|
---|
4 | http://www.crummy.com/software/BeautifulSoup/
|
---|
5 |
|
---|
6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a
|
---|
7 | tree representation. It provides methods and Pythonic idioms that make
|
---|
8 | it easy to navigate, search, and modify the tree.
|
---|
9 |
|
---|
10 | A well-formed XML/HTML document yields a well-formed data
|
---|
11 | structure. An ill-formed XML/HTML document yields a correspondingly
|
---|
12 | ill-formed data structure. If your document is only locally
|
---|
13 | well-formed, you can use this library to find and process the
|
---|
14 | well-formed part of it.
|
---|
15 |
|
---|
16 | Beautiful Soup works with Python 2.2 and up. It has no external
|
---|
17 | dependencies, but you'll have more success at converting data to UTF-8
|
---|
18 | if you also install these three packages:
|
---|
19 |
|
---|
20 | * chardet, for auto-detecting character encodings
|
---|
21 | http://chardet.feedparser.org/
|
---|
22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported
|
---|
23 | by stock Python.
|
---|
24 | http://cjkpython.i18n.org/
|
---|
25 |
|
---|
26 | Beautiful Soup defines classes for two main parsing strategies:
|
---|
27 |
|
---|
28 | * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
|
---|
29 | language that kind of looks like XML.
|
---|
30 |
|
---|
31 | * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
|
---|
32 | or invalid. This class has web browser-like heuristics for
|
---|
33 | obtaining a sensible parse tree in the face of common HTML errors.
|
---|
34 |
|
---|
35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
|
---|
36 | the encoding of an HTML or XML document, and converting it to
|
---|
37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
|
---|
38 |
|
---|
39 | For more than you ever wanted to know about Beautiful Soup, see the
|
---|
40 | documentation:
|
---|
41 | http://www.crummy.com/software/BeautifulSoup/documentation.html
|
---|
42 |
|
---|
43 | Here, have some legalese:
|
---|
44 |
|
---|
45 | Copyright (c) 2004-2007, Leonard Richardson
|
---|
46 |
|
---|
47 | All rights reserved.
|
---|
48 |
|
---|
49 | Redistribution and use in source and binary forms, with or without
|
---|
50 | modification, are permitted provided that the following conditions are
|
---|
51 | met:
|
---|
52 |
|
---|
53 | * Redistributions of source code must retain the above copyright
|
---|
54 | notice, this list of conditions and the following disclaimer.
|
---|
55 |
|
---|
56 | * Redistributions in binary form must reproduce the above
|
---|
57 | copyright notice, this list of conditions and the following
|
---|
58 | disclaimer in the documentation and/or other materials provided
|
---|
59 | with the distribution.
|
---|
60 |
|
---|
61 | * Neither the name of the the Beautiful Soup Consortium and All
|
---|
62 | Night Kosher Bakery nor the names of its contributors may be
|
---|
63 | used to endorse or promote products derived from this software
|
---|
64 | without specific prior written permission.
|
---|
65 |
|
---|
66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
---|
67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
---|
68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
---|
69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
---|
70 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
---|
71 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
---|
72 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
---|
73 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
---|
74 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
---|
75 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
---|
76 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
|
---|
77 |
|
---|
78 | """
|
---|
79 | from __future__ import generators
|
---|
80 |
|
---|
81 | __author__ = "Leonard Richardson ([email protected])"
|
---|
82 | __version__ = "3.0.5"
|
---|
83 | __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
|
---|
84 | __license__ = "New-style BSD"
|
---|
85 |
|
---|
86 | from sgmllib import SGMLParser, SGMLParseError
|
---|
87 | import codecs
|
---|
88 | import types
|
---|
89 | import re
|
---|
90 | import sgmllib
|
---|
91 | try:
|
---|
92 | from htmlentitydefs import name2codepoint
|
---|
93 | except ImportError:
|
---|
94 | name2codepoint = {}
|
---|
95 |
|
---|
# This hack makes Beautiful Soup able to parse XML with namespaces:
# sgmllib's stock tag-name pattern does not accept ':' (or '.'), so a
# namespaced tag like <ns:tag> would be truncated at the colon without
# this wider character class.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

# Encoding used when rendering the parse tree back out as a string,
# unless the caller asks for something else.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# First, the classes that represent markup elements.
|
---|
102 |
|
---|
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element carries four kinds of links: a parent, previous/next
    pointers in document (parse) order, and previousSibling/nextSibling
    pointers among its parent's contents."""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # This element is being appended, so the parent's current
            # last child becomes its previous sibling.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replaces this element (and its whole subtree) with the given
        element, at the same position under the same parent."""
        oldParent = self.parent
        myIndex = self.parent.contents.index(self)
        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
            # We're replacing this element with one of its siblings.
            index = self.parent.contents.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                self.parent.contents.remove(self)
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        # Detach from the parent and stitch the sibling chain closed.
        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts newChild at the given index among this element's
        contents, rewiring parent, sibling, and document-order links.
        Plain strings are promoted to NavigableString."""
        if (isinstance(newChild, basestring)
            or isinstance(newChild, unicode)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent != None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent == self:
                # NOTE(review): self.find() returns an element, not a
                # list index, so the comparison below mixes types
                # (legal in Python 2) -- confirm against upstream.
                index = self.find(newChild)
                if index and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # Appending at the end: walk up the ancestor chain to find
            # the element that follows this subtree in document order.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        # NOTE(review): **kwargs is accepted here but not forwarded to
        # _findAll, unlike the sibling finders -- confirm intentional.
        return self._findAll(name, attrs, text, limit, self.nextGenerator)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        # Delegates to a findAll-style method with limit=1 and unwraps
        # the single result, or returns None when nothing matched.
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #
    #Each one yields successive elements along one axis; the final
    #yield before exhaustion is None (callers filter falsy values).
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        # Replaces the %SOUP-ENCODING% placeholder (used e.g. inside
        # <meta> tags and processing instructions) with the actual
        # output encoding. Note: the parameter shadows builtin str.
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode
        if no encoding is given."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify first, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
|
---|
389 |
|
---|
390 | class NavigableString(unicode, PageElement):
|
---|
391 |
|
---|
392 | def __getnewargs__(self):
|
---|
393 | return (NavigableString.__str__(self),)
|
---|
394 |
|
---|
395 | def __getattr__(self, attr):
|
---|
396 | """text.string gives you text. This is for backwards
|
---|
397 | compatibility for Navigable*String, but for CData* it lets you
|
---|
398 | get the string without the CData wrapper."""
|
---|
399 | if attr == 'string':
|
---|
400 | return self
|
---|
401 | else:
|
---|
402 | raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
|
---|
403 |
|
---|
404 | def __unicode__(self):
|
---|
405 | return unicode(str(self))
|
---|
406 |
|
---|
407 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
|
---|
408 | if encoding:
|
---|
409 | return self.encode(encoding)
|
---|
410 | else:
|
---|
411 | return self
|
---|
412 |
|
---|
class CData(NavigableString):
    """A CDATA section; renders its text wrapped in <![CDATA[ ... ]]>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % rendered
|
---|
417 |
|
---|
class ProcessingInstruction(NavigableString):
    """A processing instruction; renders its text wrapped in <? ... ?>,
    substituting the output encoding for %SOUP-ENCODING% if present."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        output = self
        if output.find("%SOUP-ENCODING%") != -1:
            output = self.substituteEncoding(output, encoding)
        return "<?%s?>" % self.toEncoding(output, encoding)
|
---|
424 |
|
---|
class Comment(NavigableString):
    """A comment node; renders its text wrapped in <!-- ... -->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % rendered
|
---|
428 |
|
---|
class Declaration(NavigableString):
    """A declaration (e.g. DOCTYPE); renders wrapped in <! ... >."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<!%s>" % rendered
|
---|
432 |
|
---|
433 | class Tag(PageElement):
|
---|
434 |
|
---|
435 | """Represents a found HTML tag with its attributes and contents."""
|
---|
436 |
|
---|
437 | def _invert(h):
|
---|
438 | "Cheap function to invert a hash."
|
---|
439 | i = {}
|
---|
440 | for k,v in h.items():
|
---|
441 | i[v] = k
|
---|
442 | return i
|
---|
443 |
|
---|
444 | XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
|
---|
445 | "quot" : '"',
|
---|
446 | "amp" : "&",
|
---|
447 | "lt" : "<",
|
---|
448 | "gt" : ">" }
|
---|
449 |
|
---|
450 | XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
|
---|
451 |
|
---|
452 | def _convertEntities(self, match):
|
---|
453 | """Used in a call to re.sub to replace HTML, XML, and numeric
|
---|
454 | entities with the appropriate Unicode characters. If HTML
|
---|
455 | entities are being converted, any unrecognized entities are
|
---|
456 | escaped."""
|
---|
457 | x = match.group(1)
|
---|
458 | if self.convertHTMLEntities and x in name2codepoint:
|
---|
459 | return unichr(name2codepoint[x])
|
---|
460 | elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
|
---|
461 | if self.convertXMLEntities:
|
---|
462 | return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
|
---|
463 | else:
|
---|
464 | return u'&%s;' % x
|
---|
465 | elif len(x) > 0 and x[0] == '#':
|
---|
466 | # Handle numeric entities
|
---|
467 | if len(x) > 1 and x[1] == 'x':
|
---|
468 | return unichr(int(x[2:], 16))
|
---|
469 | else:
|
---|
470 | return unichr(int(x[1:]))
|
---|
471 |
|
---|
472 | elif self.escapeUnrecognizedEntities:
|
---|
473 | return u'&%s;' % x
|
---|
474 | else:
|
---|
475 | return u'&%s;' % x
|
---|
476 |
|
---|
477 | def __init__(self, parser, name, attrs=None, parent=None,
|
---|
478 | previous=None):
|
---|
479 | "Basic constructor."
|
---|
480 |
|
---|
481 | # We don't actually store the parser object: that lets extracted
|
---|
482 | # chunks be garbage-collected
|
---|
483 | self.parserClass = parser.__class__
|
---|
484 | self.isSelfClosing = parser.isSelfClosingTag(name)
|
---|
485 | self.name = name
|
---|
486 | if attrs == None:
|
---|
487 | attrs = []
|
---|
488 | self.attrs = attrs
|
---|
489 | self.contents = []
|
---|
490 | self.setup(parent, previous)
|
---|
491 | self.hidden = False
|
---|
492 | self.containsSubstitutions = False
|
---|
493 | self.convertHTMLEntities = parser.convertHTMLEntities
|
---|
494 | self.convertXMLEntities = parser.convertXMLEntities
|
---|
495 | self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
|
---|
496 |
|
---|
497 | # Convert any HTML, XML, or numeric entities in the attribute values.
|
---|
498 | convert = lambda(k, val): (k,
|
---|
499 | re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
|
---|
500 | self._convertEntities,
|
---|
501 | val))
|
---|
502 | self.attrs = map(convert, self.attrs)
|
---|
503 |
|
---|
504 | def get(self, key, default=None):
|
---|
505 | """Returns the value of the 'key' attribute for the tag, or
|
---|
506 | the value given for 'default' if it doesn't have that
|
---|
507 | attribute."""
|
---|
508 | return self._getAttrMap().get(key, default)
|
---|
509 |
|
---|
510 | def has_key(self, key):
|
---|
511 | return self._getAttrMap().has_key(key)
|
---|
512 |
|
---|
513 | def __getitem__(self, key):
|
---|
514 | """tag[key] returns the value of the 'key' attribute for the tag,
|
---|
515 | and throws an exception if it's not there."""
|
---|
516 | return self._getAttrMap()[key]
|
---|
517 |
|
---|
518 | def __iter__(self):
|
---|
519 | "Iterating over a tag iterates over its contents."
|
---|
520 | return iter(self.contents)
|
---|
521 |
|
---|
522 | def __len__(self):
|
---|
523 | "The length of a tag is the length of its list of contents."
|
---|
524 | return len(self.contents)
|
---|
525 |
|
---|
526 | def __contains__(self, x):
|
---|
527 | return x in self.contents
|
---|
528 |
|
---|
529 | def __nonzero__(self):
|
---|
530 | "A tag is non-None even if it has no contents."
|
---|
531 | return True
|
---|
532 |
|
---|
533 | def __setitem__(self, key, value):
|
---|
534 | """Setting tag[key] sets the value of the 'key' attribute for the
|
---|
535 | tag."""
|
---|
536 | self._getAttrMap()
|
---|
537 | self.attrMap[key] = value
|
---|
538 | found = False
|
---|
539 | for i in range(0, len(self.attrs)):
|
---|
540 | if self.attrs[i][0] == key:
|
---|
541 | self.attrs[i] = (key, value)
|
---|
542 | found = True
|
---|
543 | if not found:
|
---|
544 | self.attrs.append((key, value))
|
---|
545 | self._getAttrMap()[key] = value
|
---|
546 |
|
---|
547 | def __delitem__(self, key):
|
---|
548 | "Deleting tag[key] deletes all 'key' attributes for the tag."
|
---|
549 | for item in self.attrs:
|
---|
550 | if item[0] == key:
|
---|
551 | self.attrs.remove(item)
|
---|
552 | #We don't break because bad HTML can define the same
|
---|
553 | #attribute multiple times.
|
---|
554 | self._getAttrMap()
|
---|
555 | if self.attrMap.has_key(key):
|
---|
556 | del self.attrMap[key]
|
---|
557 |
|
---|
558 | def __call__(self, *args, **kwargs):
|
---|
559 | """Calling a tag like a function is the same as calling its
|
---|
560 | findAll() method. Eg. tag('a') returns a list of all the A tags
|
---|
561 | found within this tag."""
|
---|
562 | return apply(self.findAll, args, kwargs)
|
---|
563 |
|
---|
564 | def __getattr__(self, tag):
|
---|
565 | #print "Getattr %s.%s" % (self.__class__, tag)
|
---|
566 | if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
|
---|
567 | return self.find(tag[:-3])
|
---|
568 | elif tag.find('__') != 0:
|
---|
569 | return self.find(tag)
|
---|
570 | raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
|
---|
571 |
|
---|
572 | def __eq__(self, other):
|
---|
573 | """Returns true iff this tag has the same name, the same attributes,
|
---|
574 | and the same contents (recursively) as the given tag.
|
---|
575 |
|
---|
576 | NOTE: right now this will return false if two tags have the
|
---|
577 | same attributes in a different order. Should this be fixed?"""
|
---|
578 | if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
|
---|
579 | return False
|
---|
580 | for i in range(0, len(self.contents)):
|
---|
581 | if self.contents[i] != other.contents[i]:
|
---|
582 | return False
|
---|
583 | return True
|
---|
584 |
|
---|
585 | def __ne__(self, other):
|
---|
586 | """Returns true iff this tag is not identical to the other tag,
|
---|
587 | as defined in __eq__."""
|
---|
588 | return not self == other
|
---|
589 |
|
---|
590 | def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
|
---|
591 | """Renders this tag as a string."""
|
---|
592 | return self.__str__(encoding)
|
---|
593 |
|
---|
594 | def __unicode__(self):
|
---|
595 | return self.__str__(None)
|
---|
596 |
|
---|
597 | BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
---|
598 | + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
---|
599 | + ")")
|
---|
600 |
|
---|
601 | def _sub_entity(self, x):
|
---|
602 | """Used with a regular expression to substitute the
|
---|
603 | appropriate XML entity for an XML special character."""
|
---|
604 | return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
|
---|
605 |
|
---|
606 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
|
---|
607 | prettyPrint=False, indentLevel=0):
|
---|
608 | """Returns a string or Unicode representation of this tag and
|
---|
609 | its contents. To get Unicode, pass None for encoding.
|
---|
610 |
|
---|
611 | NOTE: since Python's HTML parser consumes whitespace, this
|
---|
612 | method is not certain to reproduce the whitespace present in
|
---|
613 | the original string."""
|
---|
614 |
|
---|
615 | encodedName = self.toEncoding(self.name, encoding)
|
---|
616 |
|
---|
617 | attrs = []
|
---|
618 | if self.attrs:
|
---|
619 | for key, val in self.attrs:
|
---|
620 | fmt = '%s="%s"'
|
---|
621 | if isString(val):
|
---|
622 | if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
|
---|
623 | val = self.substituteEncoding(val, encoding)
|
---|
624 |
|
---|
625 | # The attribute value either:
|
---|
626 | #
|
---|
627 | # * Contains no embedded double quotes or single quotes.
|
---|
628 | # No problem: we enclose it in double quotes.
|
---|
629 | # * Contains embedded single quotes. No problem:
|
---|
630 | # double quotes work here too.
|
---|
631 | # * Contains embedded double quotes. No problem:
|
---|
632 | # we enclose it in single quotes.
|
---|
633 | # * Embeds both single _and_ double quotes. This
|
---|
634 | # can't happen naturally, but it can happen if
|
---|
635 | # you modify an attribute value after parsing
|
---|
636 | # the document. Now we have a bit of a
|
---|
637 | # problem. We solve it by enclosing the
|
---|
638 | # attribute in single quotes, and escaping any
|
---|
639 | # embedded single quotes to XML entities.
|
---|
640 | if '"' in val:
|
---|
641 | fmt = "%s='%s'"
|
---|
642 | if "'" in val:
|
---|
643 | # TODO: replace with apos when
|
---|
644 | # appropriate.
|
---|
645 | val = val.replace("'", "&squot;")
|
---|
646 |
|
---|
647 | # Now we're okay w/r/t quotes. But the attribute
|
---|
648 | # value might also contain angle brackets, or
|
---|
649 | # ampersands that aren't part of entities. We need
|
---|
650 | # to escape those to XML entities too.
|
---|
651 | val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
|
---|
652 |
|
---|
653 | attrs.append(fmt % (self.toEncoding(key, encoding),
|
---|
654 | self.toEncoding(val, encoding)))
|
---|
655 | close = ''
|
---|
656 | closeTag = ''
|
---|
657 | if self.isSelfClosing:
|
---|
658 | close = ' /'
|
---|
659 | else:
|
---|
660 | closeTag = '</%s>' % encodedName
|
---|
661 |
|
---|
662 | indentTag, indentContents = 0, 0
|
---|
663 | if prettyPrint:
|
---|
664 | indentTag = indentLevel
|
---|
665 | space = (' ' * (indentTag-1))
|
---|
666 | indentContents = indentTag + 1
|
---|
667 | contents = self.renderContents(encoding, prettyPrint, indentContents)
|
---|
668 | if self.hidden:
|
---|
669 | s = contents
|
---|
670 | else:
|
---|
671 | s = []
|
---|
672 | attributeString = ''
|
---|
673 | if attrs:
|
---|
674 | attributeString = ' ' + ' '.join(attrs)
|
---|
675 | if prettyPrint:
|
---|
676 | s.append(space)
|
---|
677 | s.append('<%s%s%s>' % (encodedName, attributeString, close))
|
---|
678 | if prettyPrint:
|
---|
679 | s.append("\n")
|
---|
680 | s.append(contents)
|
---|
681 | if prettyPrint and contents and contents[-1] != "\n":
|
---|
682 | s.append("\n")
|
---|
683 | if prettyPrint and closeTag:
|
---|
684 | s.append(space)
|
---|
685 | s.append(closeTag)
|
---|
686 | if prettyPrint and closeTag and self.nextSibling:
|
---|
687 | s.append("\n")
|
---|
688 | s = ''.join(s)
|
---|
689 | return s
|
---|
690 |
|
---|
691 | def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
|
---|
692 | return self.__str__(encoding, True)
|
---|
693 |
|
---|
694 | def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
|
---|
695 | prettyPrint=False, indentLevel=0):
|
---|
696 | """Renders the contents of this tag as a string in the given
|
---|
697 | encoding. If encoding is None, returns a Unicode string.."""
|
---|
698 | s=[]
|
---|
699 | for c in self:
|
---|
700 | text = None
|
---|
701 | if isinstance(c, NavigableString):
|
---|
702 | text = c.__str__(encoding)
|
---|
703 | elif isinstance(c, Tag):
|
---|
704 | s.append(c.__str__(encoding, prettyPrint, indentLevel))
|
---|
705 | if text and prettyPrint:
|
---|
706 | text = text.strip()
|
---|
707 | if text:
|
---|
708 | if prettyPrint:
|
---|
709 | s.append(" " * (indentLevel-1))
|
---|
710 | s.append(text)
|
---|
711 | if prettyPrint:
|
---|
712 | s.append("\n")
|
---|
713 | return ''.join(s)
|
---|
714 |
|
---|
715 | #Soup methods
|
---|
716 |
|
---|
717 | def find(self, name=None, attrs={}, recursive=True, text=None,
|
---|
718 | **kwargs):
|
---|
719 | """Return only the first child of this Tag matching the given
|
---|
720 | criteria."""
|
---|
721 | r = None
|
---|
722 | l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
|
---|
723 | if l:
|
---|
724 | r = l[0]
|
---|
725 | return r
|
---|
726 | findChild = find
|
---|
727 |
|
---|
728 | def findAll(self, name=None, attrs={}, recursive=True, text=None,
|
---|
729 | limit=None, **kwargs):
|
---|
730 | """Extracts a list of Tag objects that match the given
|
---|
731 | criteria. You can specify the name of the Tag and any
|
---|
732 | attributes you want the Tag to have.
|
---|
733 |
|
---|
734 | The value of a key-value pair in the 'attrs' map can be a
|
---|
735 | string, a list of strings, a regular expression object, or a
|
---|
736 | callable that takes a string and returns whether or not the
|
---|
737 | string matches for some custom definition of 'matches'. The
|
---|
738 | same is true of the tag name."""
|
---|
739 | generator = self.recursiveChildGenerator
|
---|
740 | if not recursive:
|
---|
741 | generator = self.childGenerator
|
---|
742 | return self._findAll(name, attrs, text, limit, generator, **kwargs)
|
---|
743 | findChildren = findAll
|
---|
744 |
|
---|
    # Pre-3.x compatibility methods
    # first() and fetch() are the names these methods had in Beautiful
    # Soup releases before 3.x; kept as aliases so old callers work.
    first = find
    fetch = findAll
|
---|
748 |
|
---|
    def fetchText(self, text=None, recursive=True, limit=None):
        """Convenience wrapper: return all matching text nodes by
        delegating to findAll() with only text-related criteria."""
        return self.findAll(text=text, recursive=recursive, limit=limit)
|
---|
751 |
|
---|
    def firstText(self, text=None, recursive=True):
        """Convenience wrapper: return the first matching text node by
        delegating to find() with only text-related criteria."""
        return self.find(text=text, recursive=recursive)
|
---|
754 |
|
---|
755 | #Private methods
|
---|
756 |
|
---|
757 | def _getAttrMap(self):
|
---|
758 | """Initializes a map representation of this tag's attributes,
|
---|
759 | if not already initialized."""
|
---|
760 | if not getattr(self, 'attrMap'):
|
---|
761 | self.attrMap = {}
|
---|
762 | for (key, value) in self.attrs:
|
---|
763 | self.attrMap[key] = value
|
---|
764 | return self.attrMap
|
---|
765 |
|
---|
766 | #Generator methods
|
---|
767 | def childGenerator(self):
|
---|
768 | for i in range(0, len(self.contents)):
|
---|
769 | yield self.contents[i]
|
---|
770 | raise StopIteration
|
---|
771 |
|
---|
772 | def recursiveChildGenerator(self):
|
---|
773 | stack = [(self, 0)]
|
---|
774 | while stack:
|
---|
775 | tag, start = stack.pop()
|
---|
776 | if isinstance(tag, Tag):
|
---|
777 | for i in range(start, len(tag.contents)):
|
---|
778 | a = tag.contents[i]
|
---|
779 | yield a
|
---|
780 | if isinstance(a, Tag) and tag.contents:
|
---|
781 | if i < len(tag.contents) - 1:
|
---|
782 | stack.append((tag, i+1))
|
---|
783 | stack.append((a, 0))
|
---|
784 | break
|
---|
785 | raise StopIteration
|
---|
786 |
|
---|
787 | # Next, a couple classes to represent queries and their results.
|
---|
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        # name, the values in attrs, and text may each be a string, a
        # regular expression object, a callable, or a list; see
        # _matches() for the matching rules.
        self.name = name
        if isString(attrs):
            # A bare string in the attrs slot is shorthand for a
            # 'class' attribute match.
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            # Fold any extra keyword arguments into the attribute map,
            # keyword arguments winning on key collisions.
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        # A text strainer renders as its text; otherwise "name|attrs".
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Matches this strainer against a tag, given either as a Tag
        object in markupName or as a separate name/attribute pair.
        Returns the matched object, or None."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            # A Tag serves as its own attribute map (it supports .get).
            markupAttrs = markup
        # A callable name is handed the raw name and attributes rather
        # than being matched as a string.
        callFunctionWithTagData = callable(self.name) \
                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Lazily build a dict-like view of the markup's
                        # attributes; markupAttrs may already support
                        # .get, or be a list of (key, value) pairs.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Runs the strainer against a piece of markup (a list, a Tag,
        a NavigableString, or a plain string) and returns the matched
        element, or None."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found

    def _matches(self, markup, matchAgainst):
        """Returns whether markup (a Tag, tag name, attribute value, or
        piece of text) satisfies matchAgainst, which may be True, a
        callable, a regular expression object, a list, a map, or a
        string."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            # The literal True matches any non-None value.
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                # Normalize both sides to the same string type so the
                # equality test below is not skewed by str vs unicode.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
|
---|
906 |
|
---|
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Bug fix: the original called list.__init__([]), which
        # initialized a throwaway temporary list rather than this
        # instance. Initialize self instead.
        list.__init__(self)
        # The SoupStrainer that produced this result set.
        self.source = source
|
---|
913 |
|
---|
914 | # Now, some helper functions.
|
---|
915 |
|
---|
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    # Anything exposing the iterator protocol counts as listlike.
    if hasattr(l, '__iter__'):
        return True
    # Fall back to an exact type check for old-style sequences.
    return type(l) in (types.ListType, types.TupleType)
|
---|
921 |
|
---|
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        # On Python 2 this covers both str and unicode objects.
        return isinstance(s, (unicode, basestring))
    except NameError:
        # 'unicode' does not exist here; fall back to plain str.
        return isinstance(s, str)
|
---|
929 |
|
---|
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: merge its entries, later portions winning.
            for key, value in portion.items():
                built[key] = value
        elif isList(portion):
            # A list: every item maps to the shared default.
            for item in portion:
                built[item] = default
        else:
            # A scalar: it alone maps to the default.
            built[portion] = default
    return built
|
---|
948 |
|
---|
949 | # Now, the parser classes.
|
---|
950 |
|
---|
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    # (regex, replacement) fixups applied to raw markup before it is
    # handed to sgmllib; see the __init__ docstring for the two cases.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name given to the synthetic root tag that holds the whole tree.
    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed()
        except StopParsing:
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        # convert_codepoint is supplied by the SGMLParser machinery.
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None):
        """Runs (or re-runs) the parser over self.markup, optionally
        with an encoding discovered inside the document itself."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
        if markup:
            if self.markupMassage:
                # markupMassage may be True (use the defaults) or a
                # caller-supplied list of (regex, replacement) tuples.
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
               or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__') != 0:
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        # Reinitialize both superclasses and all parser state; called
        # before each (re-)parse of the document.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        # Close the tag on top of the stack; its parent becomes the
        # current tag again.
        tag = self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        # Open a new tag: attach it to the current tag and make it the
        # new current tag.
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flushes any text accumulated in self.currentData into the
        tree as a single node of the given containerClass."""
        if self.currentData:
            currentData = ''.join(self.currentData)
            # Collapse all-ASCII-whitespace runs to a single '\n' or
            # ' '; fancy Unicode spaces are deliberately left alone.
            if not currentData.translate(self.STRIP_ASCII_SPACES):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing only selected elements, drop top-level text
            # that the strainer does not ask for.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root tag is never popped.
            return

        numPops = 0
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk the open-tag stack from the innermost tag outward.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        # sgmllib callback: a start tag was seen.
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing only selected elements, skip top-level tags the
        # strainer does not ask for.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        # sgmllib callback: an end tag was seen.
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        # Accumulate raw text; endData() later flushes it into the tree.
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            # Pass the reference through unchanged.
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
            data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                #
                # NOTE(review): upstream Beautiful Soup emits the
                # escaped form '&amp;%s' here, matching the comment
                # above; this copy produces an unescaped '&%s' —
                # verify against the canonical source.
                data = "&%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: consume the rest of the input.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
|
---|
1383 |
|
---|
1384 | class BeautifulSoup(BeautifulStoneSoup):
|
---|
1385 |
|
---|
1386 | """This parser knows the following facts about HTML:
|
---|
1387 |
|
---|
1388 | * Some tags have no closing tag and should be interpreted as being
|
---|
1389 | closed as soon as they are encountered.
|
---|
1390 |
|
---|
1391 | * The text inside some tags (ie. 'script') may contain tags which
|
---|
1392 | are not really part of the document and which should be parsed
|
---|
1393 | as text, not tags. If you want to parse the text as tags, you can
|
---|
1394 | always fetch it and parse it explicitly.
|
---|
1395 |
|
---|
1396 | * Tag nesting rules:
|
---|
1397 |
|
---|
1398 | Most tags can't be nested at all. For instance, the occurance of
|
---|
1399 | a <p> tag should implicitly close the previous <p> tag.
|
---|
1400 |
|
---|
1401 | <p>Para1<p>Para2
|
---|
1402 | should be transformed into:
|
---|
1403 | <p>Para1</p><p>Para2
|
---|
1404 |
|
---|
1405 | Some tags can be nested arbitrarily. For instance, the occurance
|
---|
1406 | of a <blockquote> tag should _not_ implicitly close the previous
|
---|
1407 | <blockquote> tag.
|
---|
1408 |
|
---|
1409 | Alice said: <blockquote>Bob said: <blockquote>Blah
|
---|
1410 | should NOT be transformed into:
|
---|
1411 | Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
|
---|
1412 |
|
---|
1413 | Some tags can be nested, but the nesting is reset by the
|
---|
1414 | interposition of other tags. For instance, a <tr> tag should
|
---|
1415 | implicitly close the previous <tr> tag within the same <table>,
|
---|
1416 | but not close a <tr> tag in another table.
|
---|
1417 |
|
---|
1418 | <table><tr>Blah<tr>Blah
|
---|
1419 | should be transformed into:
|
---|
1420 | <table><tr>Blah</tr><tr>Blah
|
---|
1421 | but,
|
---|
1422 | <tr>Blah<table><tr>Blah
|
---|
1423 | should NOT be transformed into
|
---|
1424 | <tr>Blah<table></tr><tr>Blah
|
---|
1425 |
|
---|
1426 | Differing assumptions about tag nesting rules are a major source
|
---|
1427 | of problems with the BeautifulSoup class. If BeautifulSoup is not
|
---|
1428 | treating as nestable a tag your page author treats as nestable,
|
---|
1429 | try ICantBelieveItsBeautifulSoup, MinimalSoup, or
|
---|
1430 | BeautifulStoneSoup before writing your own subclass."""
|
---|
1431 |
|
---|
1432 | def __init__(self, *args, **kwargs):
|
---|
1433 | if not kwargs.has_key('smartQuotesTo'):
|
---|
1434 | kwargs['smartQuotesTo'] = self.HTML_ENTITIES
|
---|
1435 | BeautifulStoneSoup.__init__(self, *args, **kwargs)
|
---|
1436 |
|
---|
    # Tags that never take a closing tag; they are closed as soon as
    # they are opened.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    # The contents of these tags are treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
|
---|
1486 |
|
---|
    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning.

        attrs is a list of (name, value) pairs, as supplied by
        SGMLParser. Raises StopParsing to abort the current parse when
        a new charset is discovered on the first pass."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Scan the attributes for http-equiv and content, remembering
        # where the content attribute lives so it can be rewritten.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                # NOTE(review): getattr with no default raises
                # AttributeError if declaredHTMLEncoding was never set;
                # presumably it is always initialized before parsing —
                # confirm in the constructor/_feed (outside this view).
                if getattr(self, 'declaredHTMLEncoding') or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag with a placeholder
                    # that is substituted at output time.
                    newAttr = self.CHARSET_RE.sub\
                              (lambda(match):match.group(1) +
                               "%SOUP-ENCODING%", value)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        # Emit the meta tag itself (possibly with the rewritten
        # content attribute) into the tree.
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
class StopParsing(Exception):
    """Raised (by start_meta) to abandon the current parse so the
    document can be re-parsed with a newly discovered encoding."""
    pass
1533 |
|
---|
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # NOTE(review): 'strong' and 'big' each appear twice in this list;
    # presumably harmless if buildTagMap produces a map keyed by tag
    # name, but confirm against buildTagMap (outside this view).
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Extend the inherited nesting rules with the tags above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1569 |
|
---|
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # Only <noscript> resets nesting; every other nesting heuristic
    # inherited from BeautifulSoup is disabled by the empty map below.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    # No tag is assumed to nest inside (or be forbidden inside) another.
    NESTABLE_TAGS = {}
1582 |
|
---|
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the standard pop, promote this tag's single string
        # child into an attribute of its parent (unless the parent
        # already has an attribute of that name).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()  # make sure parent.attrMap is built
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1613 |
|
---|
#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisiness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
    pass
1632 |
|
---|
1633 | ######################################################
|
---|
1634 | #
|
---|
1635 | # Bonus library: Unicode, Dammit
|
---|
1636 | #
|
---|
1637 | # This class forces XML data into a standard format (usually to UTF-8
|
---|
1638 | # or Unicode). It is heavily based on code from Mark Pilgrim's
|
---|
1639 | # Universal Feed Parser. It does not rewrite the XML or HTML to
|
---|
1640 | # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
|
---|
1641 | # (XML) and BeautifulSoup.start_meta (HTML).
|
---|
1642 |
|
---|
# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
# chardet is optional: when the import fails, chardet is set to None
# and UnicodeDammit falls back to declared/sniffed encodings plus the
# utf-8/windows-1252 guesses.
try:
    import chardet
    # To debug chardet's detection, uncomment:
    # import chardet.constants
    # chardet.constants._debug = 1
except ImportError:
    chardet = None
# BUG FIX: the original code unconditionally re-assigned
# `chardet = None` after the try/except (a debug leftover), which
# disabled chardet even when it was installed. The bare `except:` is
# also narrowed to ImportError so real errors aren't swallowed.
1652 |
|
---|
# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
# Importing them registers extra codecs as a side effect; the modules
# themselves are never referenced again, so a failed import is simply
# ignored. The excepts are narrowed from bare `except:` to
# ImportError so SystemExit/KeyboardInterrupt and genuine errors are
# not swallowed.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass
1664 |
|
---|
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.

    After construction: self.unicode is the converted document (or
    None if every candidate encoding failed), self.markup is the
    possibly-transcoded input, and self.originalEncoding is the codec
    that succeeded (or None)."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    # NOTE(review): overrideEncodings=[] is a mutable default argument;
    # it is only iterated below, never mutated, so it is safe here, but
    # a tuple default would make that explicit.
    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml'):
        # _detectEncoding may transcode the markup (EBCDIC, UTF-16/32
        # with or without BOM -> UTF-8) and returns any XML-declared
        # encoding plus the encoding sniffed from the byte signature.
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        # Empty or already-unicode input needs no conversion.
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        # Try encodings in priority order: caller-supplied overrides
        # first, then the document-declared encoding, then the sniffed
        # one. _convertFrom returns a true value on success.
        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break
        # self.unicode is None when every candidate encoding failed.
        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity. Non-tuple entries in MS_CHARS (plain replacement
        strings) are returned as-is."""
        sub = self.MS_CHARS.get(orig)
        if type(sub) == types.TupleType:
            # Tuple entries are (HTML entity name, hex code point).
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Tries to decode self.markup from the proposed encoding.
        Returns the converted markup on success; returns None when the
        codec is unknown, was already tried, or the decode fails."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            # Only commit the markup/encoding when the decode succeeds.
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present); a BOM also overrides the
        # caller-supplied encoding, since it is authoritative.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding.
        Returns (possibly transcoded xml_data, declared XML encoding
        or None, byte-signature-sniffed encoding or None)."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            # Each branch recognizes a byte signature; UTF-16/32 input
            # is re-encoded to UTF-8 so the regex below can match it.
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
            # Pull the declared encoding out of an XML declaration.
            xml_encoding_match = re.compile \
                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
                                 .match(xml_data)
        # NOTE(review): bare except — swallows any error during
        # sniffing, not just decode errors; confirm this breadth is
        # intended.
        except:
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            # If the declared encoding only names a generic 2/4-byte
            # Unicode family, trust the byte-signature sniff, which
            # knows the actual endianness.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Maps a charset name to a Python codec name: tries the alias
        table, then the name with hyphens removed, then with hyphens
        replaced by underscores; falls back to returning the charset
        unchanged (which may not be a valid codec)."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        # Returns charset when Python has a codec for it, else None.
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Translation table from EBCDIC bytes to ASCII, built lazily on
    # first use and cached on the class.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translates an EBCDIC-encoded byte string to ASCII."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Map of Windows-1252 "smart" bytes 0x80-0x9f to either an
    # (entity name, hex code point) pair or a literal replacement
    # string; consumed by _subMSChar.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # NOTE(review): the hex code point for Yuml is empty,
                 # so smartQuotesTo='xml' would emit the malformed
                 # reference '&#x;' for byte 0x9f; later upstream
                 # versions use '178' (U+0178) — confirm before fixing.
                 '\x9f' : ('Yuml', ''),}
1912 |
|
---|
1913 | #######################################################################
|
---|
1914 |
|
---|
1915 |
|
---|
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    # Read a whole HTML document from stdin, parse it, and write the
    # indented parse tree to stdout (Python 2 print statement).
    soup = BeautifulSoup(sys.stdin.read())
    print soup.prettify()