source: gs2-extensions/ngramj/src/wiki/BeautifulSoup.py@ 31803

Last change on this file since 31803 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 74.2 KB
Line 
1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7tree representation. It provides methods and Pythonic idioms that make
8it easy to navigate, search, and modify the tree.
9
10A well-formed XML/HTML document yields a well-formed data
11structure. An ill-formed XML/HTML document yields a correspondingly
12ill-formed data structure. If your document is only locally
13well-formed, you can use this library to find and process the
14well-formed part of it.
15
16Beautiful Soup works with Python 2.2 and up. It has no external
17dependencies, but you'll have more success at converting data to UTF-8
18if you also install these three packages:
19
20* chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 by stock Python.
24 http://cjkpython.i18n.org/
25
26Beautiful Soup defines classes for two main parsing strategies:
27
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
34
35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36the encoding of an HTML or XML document, and converting it to
37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39For more than you ever wanted to know about Beautiful Soup, see the
40documentation:
41http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43Here, have some legalese:
44
45Copyright (c) 2004-2007, Leonard Richardson
46
47All rights reserved.
48
49Redistribution and use in source and binary forms, with or without
50modification, are permitted provided that the following conditions are
51met:
52
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
55
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
60
61 * Neither the name of the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
65
66THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
78"""
79from __future__ import generators
80
81__author__ = "Leonard Richardson ([email protected])"
82__version__ = "3.0.5"
83__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
84__license__ = "New-style BSD"
85
86from sgmllib import SGMLParser, SGMLParseError
87import codecs
88import types
89import re
90import sgmllib
91try:
92 from htmlentitydefs import name2codepoint
93except ImportError:
94 name2codepoint = {}
95
96#This hack makes Beautiful Soup able to parse XML with namespaces
97sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
98
99DEFAULT_OUTPUT_ENCODING = "utf-8"
100
101# First, the classes that represent markup elements.
102
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # This element becomes the new last child of its parent.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Removes this element from the tree and puts the given
        element in its place."""
        oldParent = self.parent
        myIndex = self.parent.contents.index(self)
        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
            # We're replacing this element with one of its siblings.
            index = self.parent.contents.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                self.parent.contents.remove(self)
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts the given element at the given position among this
        element's children, rewiring every next/previous/sibling link."""
        if (isinstance(newChild, basestring)
            or isinstance(newChild, unicode)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent != None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent == self:
                # NOTE(review): self.find() returns an element, not an
                # index, so 'index < position' relies on Python 2's
                # cross-type comparison. Kept as-is to preserve
                # behavior; verify intent.
                index = self.find(newChild)
                if index and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The new child is the last element in this subtree: its
            # 'next' is found by walking up the ancestors until one has
            # a next sibling.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        # Fix: forward **kwargs so keyword attribute criteria are
        # honored (they were silently dropped before).
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        # Fix: forward **kwargs (they were accepted but ignored before).
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given findAll-style method with limit=1 and
        returns the single result, or None."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #(Each generator yields a final None before stopping; callers in
    #_findAll filter it out with 'if i'.)
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder in the given string
        with the encoding name (default 'utf-8')."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.
        ."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify it first, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
389
class NavigableString(unicode, PageElement):
    """A piece of text in the parse tree: a Unicode string that also
    carries PageElement navigation links."""

    def __getnewargs__(self):
        # Support copy/pickle: recreate the object from its encoded
        # string form.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            # Call form of raise: valid in both Python 2 and Python 3
            # (the old 'raise X, msg' statement is Python-2-only).
            raise AttributeError("'%s' object has no attribute '%s'"
                                 % (self.__class__.__name__, attr))

    def __unicode__(self):
        return unicode(str(self))

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders the string encoded in the given encoding, or as
        Unicode if encoding is None."""
        if encoding:
            return self.encode(encoding)
        else:
            return self
412
class CData(NavigableString):
    """A CDATA section: renders its text inside the <![CDATA[...]]>
    wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the underlying string first, then wrap it.
        rendered = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % rendered
417
class ProcessingInstruction(NavigableString):
    """A processing instruction: renders its text inside the <?...?>
    wrapper, substituting the output encoding if requested."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = self
        # Substitute the declared encoding into the PI text if the
        # placeholder is present.
        if "%SOUP-ENCODING%" in body:
            body = self.substituteEncoding(body, encoding)
        return "<?%s?>" % self.toEncoding(body, encoding)
424
class Comment(NavigableString):
    """A comment: renders its text inside the <!--...--> wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % rendered
428
class Declaration(NavigableString):
    """An SGML declaration: renders its text inside the <!...> wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<!%s>" % rendered
432
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k, v in h.items():
            i[v] = k
        return i

    # XML entity name -> literal character, and (below) its inverse.
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities, hex (&#xNN;) or decimal (&#NN;).
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))

        elif self.escapeUnrecognizedEntities:
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs == None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute
        # values. (Rewritten without the Python-2-only tuple-parameter
        # lambda, which is a SyntaxError on Python 3; behavior is
        # unchanged. Note: relies on Python 2's map() returning a list.)
        convert = lambda kv: (kv[0],
                              re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                     self._convertEntities,
                                     kv[1]))
        self.attrs = map(convert, self.attrs)

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        "Returns true if the tag has an attribute with the given name."
        return self._getAttrMap().has_key(key)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        # (Removed a redundant second assignment through _getAttrMap();
        # the map entry was already set above.)

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        for item in self.attrs:
            if item[0] == key:
                self.attrs.remove(item)
        #We don't break because bad HTML can define the same
        #attribute multiple times.
        self._getAttrMap()
        if self.attrMap.has_key(key):
            del self.attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        # Direct call instead of the long-deprecated apply() builtin.
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """Attribute access falls back to tag lookup: tag.fooTag (or
        tag.foo) finds the first 'foo' tag in the subtree."""
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        # Call form of raise: valid in both Python 2 and Python 3.
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        return self.__str__(None)

    # Matches angle brackets, and ampersands that aren't already part
    # of an entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + ")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # Hidden tags (e.g. the root) render only their contents.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Renders this tag with pretty-printed indentation."
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria. You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        "Pre-3.x compatibility: find all matching text elements."
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        "Pre-3.x compatibility: find the first matching text element."
        return self.find(text=text, recursive=recursive)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Fix: pass a default to getattr(). Without it, a missing
        # attrMap fell through to __getattr__, which performed a full
        # subtree find('attrMap') -- slow, and wrong if the document
        # actually contains an <attrmap> tag.
        if not getattr(self, 'attrMap', None):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        "Yields each direct child of this tag in order."
        # (Removed the redundant trailing 'raise StopIteration';
        # falling off the end of a generator already stops it, and the
        # explicit raise is an error under PEP 479.)
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields every element beneath this tag in document order.
        The stack holds (tag, index-to-resume-at) pairs."""
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
786
787# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = name
        if isString(attrs):
            # A bare string for 'attrs' is shorthand for a CSS class.
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                # Copy before updating so the caller's dict (and the
                # shared default) isn't mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Matches this strainer against a tag, given either as a Tag
        object (in markupName) or as a name/attribute pair. Returns
        the matched object, or None."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        callFunctionWithTagData = callable(self.name) \
                                  and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Lazily build a dict view of the attributes.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k, v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Matches this strainer against a piece of markup (a Tag, a
        text element, or a list of them). Returns the match, or None."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            # Call form of raise: valid in both Python 2 and Python 3.
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            # The literal True matches any non-None value.
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): this asks the (string) markup whether
                # it has the dict as a key, which looks inverted.
                # Kept as-is to preserve behavior; verify intent.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
906
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Fix: initialize *self* as an empty list. The old code called
        # list.__init__([]), which initialized a throwaway literal
        # instead of this instance.
        list.__init__(self)
        self.source = source
913
914# Now, some helper functions.
915
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    # Anything with an iterator protocol counts; otherwise fall back
    # to an exact type check against list/tuple.
    if hasattr(l, '__iter__'):
        return True
    return type(l) in (types.ListType, types.TupleType)
921
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        # On Pythons with a unicode type, accept unicode or basestring.
        stringy = isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        # No unicode type available: fall back to plain str.
        stringy = isinstance(s, str)
    return stringy
929
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: merge its entries verbatim.
            for key, value in portion.items():
                built[key] = value
        elif isList(portion):
            # A list: map every element to the default.
            for key in portion:
                built[key] = default
        else:
            # A scalar: map it to the default.
            built[portion] = default
    return built
948
949# Now, the parser classes.
950
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Subclasses override these maps to give specific tags special
    # treatment; the base parser treats every tag identically.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    # Default pre-parse regex fixes (see the __init__ docstring):
    # 1. insert a space before "/>" so sgmllib accepts "<br/>";
    # 2. strip stray whitespace after "<!" in declarations.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name of the synthetic root Tag that represents the whole document.
    ROOT_TAG_NAME = u'[document]'

    # Accepted values for the convertEntities constructor argument.
    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed()
        except StopParsing:
            # Raised by subclasses (e.g. start_meta) after re-feeding
            # the document with a newly discovered encoding.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None):
        """Decode self.markup to Unicode, optionally run the massage
        regexes over it, and feed it through the SGML parser."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
        if markup:
            if self.markupMassage:
                # markupMassage may be True (use the defaults) or a
                # custom list of (regex, replacement) pairs.
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        # start_/end_/do_ lookups come from sgmllib's dispatch; dunder
        # lookups must fail fast so protocols like copy/pickle work.
        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
               or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__') != 0:
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        # Checks both the class-level map and the per-instance map
        # built from the selfClosingTags constructor argument.
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Reset all parser state; called before each _feed()."""
        # This soup object doubles as the root Tag of the tree.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pop the top of the tag stack and return the new current tag."""
        tag = self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Make the given tag the current parent for parsed content."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush accumulated text into the tree as a containerClass
        node (NavigableString by default, or a subclass such as
        Comment or CData via _toStringSubclass)."""
        if self.currentData:
            currentData = ''.join(self.currentData)
            # translate() with a codepoint->None map deletes ASCII
            # whitespace; an empty result means the node was nothing
            # but whitespace, which collapses to '\n' or ' '.
            if not currentData.translate(self.STRIP_ASCII_SPACES):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing only part of the document, drop top-level
            # text that the SoupStrainer doesn't want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root tag is never popped.
            return

        numPops = 0
        mostRecentTag = None
        # Search downward from the top of the stack (index 0 is the
        # root and is deliberately excluded).
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk down the open-tag stack from the most recent tag.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this assignment is dead code -- p is
            # rebound from tagStack[i] at the top of every iteration.
            # Looks like a leftover from a linked-list traversal.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """SGMLParser callback for a start tag with no dedicated
        start_<name> handler: builds a Tag node and pushes it."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing only part of the document, skip top-level tags
        # the SoupStrainer isn't interested in.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            # Inside e.g. <script>: treat everything up to the matching
            # end tag as literal text.
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """SGMLParser callback for an end tag with no dedicated
        end_<name> handler."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        # Text is buffered until endData() turns it into a node.
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            # Leave the reference as literal text.
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            # Handle CDATA ourselves; sgmllib can't.
            k = self.rawdata.find(']]>', i)
            if k == -1:
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Malformed declaration: keep it as literal text and
                # resume parsing after it.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
1383
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # HTML input defaults to converting MS smart quotes to HTML
        # entities rather than XML character references.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    # Contents of these tags are treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Pull out the http-equiv and content attributes, remembering
        # where 'content' sits so it can be rewritten in place.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                # NOTE(review): getattr() here has no default, so the
                # first-pass lookup relies on Tag.__getattr__ resolving
                # 'declaredHTMLEncoding' without raising -- confirm.
                if getattr(self, 'declaredHTMLEncoding') or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag.
                    newAttr = self.CHARSET_RE.sub\
                              (lambda(match):match.group(1) +
                               "%SOUP-ENCODING%", value)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            # Mark the tag so tree output plugs in the real encoding.
            tag.containsSubstitutions = True
1530
class StopParsing(Exception):
    """Raised internally to abort a parse in progress -- e.g. when
    start_meta discovers the document's real encoding and re-feeds
    the markup from scratch."""
    pass
1533
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags that are valid (if unusual) when nested in
    # themselves. Fix: the original list contained 'strong' and 'big'
    # twice; the duplicates were redundant, since buildTagMap maps
    # each name to the default exactly once.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Extend BeautifulSoup's nesting rules with the lists above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1569
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap('noscript') passes 'noscript' as the
    # *default* with no tag arguments, so this evaluates to an empty
    # map (no tag resets nesting). Probably intended as
    # buildTagMap(None, 'noscript') -- confirm before changing.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    # No assumptions at all about which tags can nest.
    NESTABLE_TAGS = {}
1582
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the normal pop, mirror a single-string-child tag into
        # its parent's attribute map -- unless the parent already has
        # an attribute by that name.
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()  # ensure parent.attrMap is populated
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1613
1614#Enterprise class names! It has come to our attention that some people
1615#think the names of the Beautiful Soup parser classes are too silly
1616#and "unprofessional" for use in enterprise screen-scraping. We feel
1617#your pain! For such-minded folk, the Beautiful Soup Consortium And
1618#All-Night Kosher Bakery recommends renaming this file to
1619#"RobustParser.py" (or, in cases of extreme enterprisiness,
1620#"RobustParserBeanInterface.class") and using the following
1621#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
    pass
1632
1633######################################################
1634#
1635# Bonus library: Unicode, Dammit
1636#
1637# This class forces XML data into a standard format (usually to UTF-8
1638# or Unicode). It is heavily based on code from Mark Pilgrim's
1639# Universal Feed Parser. It does not rewrite the XML or HTML to
1640# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1641# (XML) and BeautifulSoup.start_meta (HTML).
1642
1643# Autodetects character encodings.
1644# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except:
    chardet = None
# NOTE(review): this unconditionally discards chardet even when the
# import above succeeded, disabling encoding auto-detection entirely.
# It looks like a deliberate local modification to the vendored file;
# confirm the intent before removing it.
chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass
1664
1665class UnicodeDammit:
1666 """A class for detecting the encoding of a *ML document and
1667 converting it to a Unicode string. If the source encoding is
1668 windows-1252, can replace MS smart quotes with their HTML or XML
1669 equivalents."""
1670
1671 # This dictionary maps commonly seen values for "charset" in HTML
1672 # meta tags to the corresponding Python codec names. It only covers
1673 # values that aren't in Python's aliases and can't be determined
1674 # by the heuristics in find_codec.
1675 CHARSET_ALIASES = { "macintosh" : "mac-roman",
1676 "x-sjis" : "shift-jis" }
1677
1678 def __init__(self, markup, overrideEncodings=[],
1679 smartQuotesTo='xml'):
1680 self.markup, documentEncoding, sniffedEncoding = \
1681 self._detectEncoding(markup)
1682 self.smartQuotesTo = smartQuotesTo
1683 self.triedEncodings = []
1684 if markup == '' or isinstance(markup, unicode):
1685 self.originalEncoding = None
1686 self.unicode = unicode(markup)
1687 return
1688
1689 u = None
1690 for proposedEncoding in overrideEncodings:
1691 u = self._convertFrom(proposedEncoding)
1692 if u: break
1693 if not u:
1694 for proposedEncoding in (documentEncoding, sniffedEncoding):
1695 u = self._convertFrom(proposedEncoding)
1696 if u: break
1697
1698 # If no luck and we have auto-detection library, try that:
1699 if not u and chardet and not isinstance(self.markup, unicode):
1700 u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1701
1702 # As a last resort, try utf-8 and windows-1252:
1703 if not u:
1704 for proposed_encoding in ("utf-8", "windows-1252"):
1705 u = self._convertFrom(proposed_encoding)
1706 if u: break
1707 self.unicode = u
1708 if not u: self.originalEncoding = None
1709
1710 def _subMSChar(self, orig):
1711 """Changes a MS smart quote character to an XML or HTML
1712 entity."""
1713 sub = self.MS_CHARS.get(orig)
1714 if type(sub) == types.TupleType:
1715 if self.smartQuotesTo == 'xml':
1716 sub = '&#x%s;' % sub[1]
1717 else:
1718 sub = '&%s;' % sub[0]
1719 return sub
1720
    def _convertFrom(self, proposed):
        """Try to decode self.markup with the proposed encoding.
        Returns the Unicode markup on success; None if the codec is
        unknown, already tried, or the decode fails."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            # Success: keep the decoded markup and remember the codec.
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup
1748
    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present); a recognized BOM also
        # overrides the caller's encoding, since it pins down the
        # actual byte order.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            # UTF-16BE BOM (the \x00\x00 test rules out a UTF-32 BOM
            # that begins with the same two bytes).
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            # UTF-16LE BOM.
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            # UTF-8 BOM.
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE BOM.
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE BOM.
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata
1773
    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding.
        Returns (data, declared encoding, sniffed encoding); the data
        is re-encoded to UTF-8 when a BOM or a UTF-16/32 byte pattern
        of '<?' was recognized."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
            # Look for an encoding declared in the XML prolog.
            xml_encoding_match = re.compile \
                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
                                 .match(xml_data)
        except:
            # Bare except is deliberate best-effort: any failure above
            # just means we have no declaration to report.
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            # If the declared encoding is a byte-order-ambiguous family
            # (ucs-2/ucs-4/utf-16/utf-32), trust the sniffed variant,
            # which already knows the byte order.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding
1836
1837
1838 def find_codec(self, charset):
1839 return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1840 or (charset and self._codec(charset.replace("-", ""))) \
1841 or (charset and self._codec(charset.replace("-", "_"))) \
1842 or charset
1843
1844 def _codec(self, charset):
1845 if not charset: return charset
1846 codec = None
1847 try:
1848 codecs.lookup(charset)
1849 codec = charset
1850 except (LookupError, ValueError):
1851 pass
1852 return codec
1853
    # Lazily-built 256-byte translation table from EBCDIC to ASCII,
    # cached on the class so it is constructed at most once per process.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the EBCDIC-encoded byte string *s* into ASCII.

        On first call, builds the translation table from the literal
        byte map below using the Python 2 string.maketrans API and
        stores it on the class; subsequent calls reuse it.
        """
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            # emap[i] is the ASCII byte value for EBCDIC byte value i.
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)
1879
1880 MS_CHARS = { '\x80' : ('euro', '20AC'),
1881 '\x81' : ' ',
1882 '\x82' : ('sbquo', '201A'),
1883 '\x83' : ('fnof', '192'),
1884 '\x84' : ('bdquo', '201E'),
1885 '\x85' : ('hellip', '2026'),
1886 '\x86' : ('dagger', '2020'),
1887 '\x87' : ('Dagger', '2021'),
1888 '\x88' : ('circ', '2C6'),
1889 '\x89' : ('permil', '2030'),
1890 '\x8A' : ('Scaron', '160'),
1891 '\x8B' : ('lsaquo', '2039'),
1892 '\x8C' : ('OElig', '152'),
1893 '\x8D' : '?',
1894 '\x8E' : ('#x17D', '17D'),
1895 '\x8F' : '?',
1896 '\x90' : '?',
1897 '\x91' : ('lsquo', '2018'),
1898 '\x92' : ('rsquo', '2019'),
1899 '\x93' : ('ldquo', '201C'),
1900 '\x94' : ('rdquo', '201D'),
1901 '\x95' : ('bull', '2022'),
1902 '\x96' : ('ndash', '2013'),
1903 '\x97' : ('mdash', '2014'),
1904 '\x98' : ('tilde', '2DC'),
1905 '\x99' : ('trade', '2122'),
1906 '\x9a' : ('scaron', '161'),
1907 '\x9b' : ('rsaquo', '203A'),
1908 '\x9c' : ('oelig', '153'),
1909 '\x9d' : '?',
1910 '\x9e' : ('#x17E', '17E'),
1911 '\x9f' : ('Yuml', ''),}
1912
1913#######################################################################
1914
1915
1916#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    # Parse an HTML document read from standard input and print its
    # indented ("prettified") rendering (Python 2 print statement).
    soup = BeautifulSoup(sys.stdin.read())
    print soup.prettify()
Note: See TracBrowser for help on using the repository browser.