source: gs2-extensions/ngramj/src/wiki/BeautifulSoup.py@ 31803

Last change on this file since 31803 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 74.2 KB
Line 
1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7tree representation. It provides methods and Pythonic idioms that make
8it easy to navigate, search, and modify the tree.
9
10A well-formed XML/HTML document yields a well-formed data
11structure. An ill-formed XML/HTML document yields a correspondingly
12ill-formed data structure. If your document is only locally
13well-formed, you can use this library to find and process the
14well-formed part of it.
15
16Beautiful Soup works with Python 2.2 and up. It has no external
17dependencies, but you'll have more success at converting data to UTF-8
18if you also install these three packages:
19
20* chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 by stock Python.
24 http://cjkpython.i18n.org/
25
26Beautiful Soup defines classes for two main parsing strategies:
27
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
34
35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36the encoding of an HTML or XML document, and converting it to
37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39For more than you ever wanted to know about Beautiful Soup, see the
40documentation:
41http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43Here, have some legalese:
44
45Copyright (c) 2004-2007, Leonard Richardson
46
47All rights reserved.
48
49Redistribution and use in source and binary forms, with or without
50modification, are permitted provided that the following conditions are
51met:
52
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
55
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
60
61 * Neither the name of the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
65
66THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
78"""
79from __future__ import generators
80
81__author__ = "Leonard Richardson ([email protected])"
82__version__ = "3.0.5"
83__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
84__license__ = "New-style BSD"
85
86from sgmllib import SGMLParser, SGMLParseError
87import codecs
88import types
89import re
90import sgmllib
91try:
92 from htmlentitydefs import name2codepoint
93except ImportError:
94 name2codepoint = {}
95
96#This hack makes Beautiful Soup able to parse XML with namespaces
97sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
98
99DEFAULT_OUTPUT_ENCODING = "utf-8"
100
101# First, the classes that represent markup elements.
102
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            # This element becomes the new last child of its parent.
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Removes this element from the tree and puts the given
        element in its place."""
        oldParent = self.parent
        myIndex = self.parent.contents.index(self)
        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
            # We're replacing this element with one of its siblings.
            index = self.parent.contents.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                self.parent.contents.remove(self)
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts the given element at the given position among this
        element's children, rewiring every next/previous/sibling link."""
        if (isinstance(newChild, basestring)
            or isinstance(newChild, unicode)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent != None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent == self:
                # NOTE(review): self.find() returns an element, not an
                # index, so 'index < position' relies on Python 2's
                # cross-type comparison. Kept as-is to preserve
                # behavior; verify intent.
                index = self.find(newChild)
                if index and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The new child is the last element in this subtree: its
            # 'next' is found by walking up the ancestors until one has
            # a next sibling.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        # Fix: forward **kwargs so keyword attribute criteria are
        # honored (they were silently dropped before).
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        # Fix: forward **kwargs (they were accepted but ignored before).
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given findAll-style method with limit=1 and
        returns the single result, or None."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #(Each generator yields a final None before stopping; callers in
    #_findAll filter it out with 'if i'.)
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder in the given string
        with the encoding name (default 'utf-8')."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.
        ."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            # Not a string at all: stringify it first, then recurse.
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
389
class NavigableString(unicode, PageElement):
    """A piece of text in the parse tree: a Unicode string that also
    carries PageElement navigation links."""

    def __getnewargs__(self):
        # Support copy/pickle: recreate the object from its encoded
        # string form.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            # Call form of raise: valid in both Python 2 and Python 3
            # (the old 'raise X, msg' statement is Python-2-only).
            raise AttributeError("'%s' object has no attribute '%s'"
                                 % (self.__class__.__name__, attr))

    def __unicode__(self):
        return unicode(str(self))

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders the string encoded in the given encoding, or as
        Unicode if encoding is None."""
        if encoding:
            return self.encode(encoding)
        else:
            return self
412
class CData(NavigableString):
    """A CDATA section: renders its text inside the <![CDATA[...]]>
    wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the underlying string first, then wrap it.
        rendered = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % rendered
417
class ProcessingInstruction(NavigableString):
    """A processing instruction: renders its text inside the <?...?>
    wrapper, substituting the output encoding if requested."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = self
        # Substitute the declared encoding into the PI text if the
        # placeholder is present.
        if "%SOUP-ENCODING%" in body:
            body = self.substituteEncoding(body, encoding)
        return "<?%s?>" % self.toEncoding(body, encoding)
424
class Comment(NavigableString):
    """A comment: renders its text inside the <!--...--> wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % rendered
428
class Declaration(NavigableString):
    """An SGML declaration: renders its text inside the <!...> wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<!%s>" % rendered
432
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k, v in h.items():
            i[v] = k
        return i

    # XML entity name -> literal character, and (below) its inverse.
    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities, hex (&#xNN;) or decimal (&#NN;).
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))

        elif self.escapeUnrecognizedEntities:
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs == None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute
        # values. (Rewritten without the Python-2-only tuple-parameter
        # lambda, which is a SyntaxError on Python 3; behavior is
        # unchanged. Note: relies on Python 2's map() returning a list.)
        convert = lambda kv: (kv[0],
                              re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                     self._convertEntities,
                                     kv[1]))
        self.attrs = map(convert, self.attrs)

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        "Returns true if the tag has an attribute with the given name."
        return self._getAttrMap().has_key(key)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        # (Removed a redundant second assignment through _getAttrMap();
        # the map entry was already set above.)

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        for item in self.attrs:
            if item[0] == key:
                self.attrs.remove(item)
        #We don't break because bad HTML can define the same
        #attribute multiple times.
        self._getAttrMap()
        if self.attrMap.has_key(key):
            del self.attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        # Direct call instead of the long-deprecated apply() builtin.
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """Attribute access falls back to tag lookup: tag.fooTag (or
        tag.foo) finds the first 'foo' tag in the subtree."""
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        # Call form of raise: valid in both Python 2 and Python 3.
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        return self.__str__(None)

    # Matches angle brackets, and ampersands that aren't already part
    # of an entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + ")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # Hidden tags (e.g. the root) render only their contents.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Renders this tag with pretty-printed indentation."
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria. You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        "Pre-3.x compatibility: find all matching text elements."
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        "Pre-3.x compatibility: find the first matching text element."
        return self.find(text=text, recursive=recursive)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Fix: pass a default to getattr(). Without it, a missing
        # attrMap fell through to __getattr__, which performed a full
        # subtree find('attrMap') -- slow, and wrong if the document
        # actually contains an <attrmap> tag.
        if not getattr(self, 'attrMap', None):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        "Yields each direct child of this tag in order."
        # (Removed the redundant trailing 'raise StopIteration';
        # falling off the end of a generator already stops it, and the
        # explicit raise is an error under PEP 479.)
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields every element beneath this tag in document order.
        The stack holds (tag, index-to-resume-at) pairs."""
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
786
787# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = name
        if isString(attrs):
            # A bare string for 'attrs' is shorthand for a CSS class.
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                # Copy before updating so the caller's dict (and the
                # shared default) isn't mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Matches this strainer against a tag, given either as a Tag
        object (in markupName) or as a name/attribute pair. Returns
        the matched object, or None."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        callFunctionWithTagData = callable(self.name) \
                                  and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        # Lazily build a dict view of the attributes.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k, v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Matches this strainer against a piece of markup (a Tag, a
        text element, or a list of them). Returns the match, or None."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            # Call form of raise: valid in both Python 2 and Python 3.
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            # The literal True matches any non-None value.
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): this asks the (string) markup whether
                # it has the dict as a key, which looks inverted.
                # Kept as-is to preserve behavior; verify intent.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
906
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Fix: initialize *self* as an empty list. The old code called
        # list.__init__([]), which initialized a throwaway literal
        # instead of this instance.
        list.__init__(self)
        self.source = source
913
914# Now, some helper functions.
915
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    # Anything with an iterator protocol counts; otherwise fall back
    # to an exact type check against list/tuple.
    if hasattr(l, '__iter__'):
        return True
    return type(l) in (types.ListType, types.TupleType)
921
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        # On Pythons with a unicode type, accept unicode or basestring.
        stringy = isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        # No unicode type available: fall back to plain str.
        stringy = isinstance(s, str)
    return stringy
929
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: merge its entries verbatim.
            for key, value in portion.items():
                built[key] = value
        elif isList(portion):
            # A list: map every element to the default.
            for key in portion:
                built[key] = default
        else:
            # A scalar: map it to the default.
            built[portion] = default
    return built
948
949# Now, the parser classes.
950
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Subclasses override these maps to give specific tags special
    # treatment; the base parser treats every tag identically.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    # Default pre-parse regex fixes (see the __init__ docstring):
    # 1. insert a space before "/>" so sgmllib accepts "<br/>";
    # 2. strip stray whitespace after "<!" in declarations.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name of the synthetic root Tag that represents the whole document.
    ROOT_TAG_NAME = u'[document]'

    # Accepted values for the convertEntities constructor argument.
    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed()
        except StopParsing:
            # Raised by subclasses (e.g. start_meta) after re-feeding
            # the document with a newly discovered encoding.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None):
        """Decode self.markup to Unicode, optionally run the massage
        regexes over it, and feed it through the SGML parser."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
        if markup:
            if self.markupMassage:
                # markupMassage may be True (use the defaults) or a
                # custom list of (regex, replacement) pairs.
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        # start_/end_/do_ lookups come from sgmllib's dispatch; dunder
        # lookups must fail fast so protocols like copy/pickle work.
        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
               or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__') != 0:
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        # Checks both the class-level map and the per-instance map
        # built from the selfClosingTags constructor argument.
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Reset all parser state; called before each _feed()."""
        # This soup object doubles as the root Tag of the tree.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pop the top of the tag stack and return the new current tag."""
        tag = self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Make the given tag the current parent for parsed content."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush accumulated text into the tree as a containerClass
        node (NavigableString by default, or a subclass such as
        Comment or CData via _toStringSubclass)."""
        if self.currentData:
            currentData = ''.join(self.currentData)
            # translate() with a codepoint->None map deletes ASCII
            # whitespace; an empty result means the node was nothing
            # but whitespace, which collapses to '\n' or ' '.
            if not currentData.translate(self.STRIP_ASCII_SPACES):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing only part of the document, drop top-level
            # text that the SoupStrainer doesn't want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root tag is never popped.
            return

        numPops = 0
        mostRecentTag = None
        # Search downward from the top of the stack (index 0 is the
        # root and is deliberately excluded).
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk down the open-tag stack from the most recent tag.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this assignment is dead code -- p is
            # rebound from tagStack[i] at the top of every iteration.
            # Looks like a leftover from a linked-list traversal.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """SGMLParser callback for a start tag with no dedicated
        start_<name> handler: builds a Tag node and pushes it."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing only part of the document, skip top-level tags
        # the SoupStrainer isn't interested in.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            # Inside e.g. <script>: treat everything up to the matching
            # end tag as literal text.
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """SGMLParser callback for an end tag with no dedicated
        end_<name> handler."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        # Text is buffered until endData() turns it into a node.
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            # Leave the reference as literal text.
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            # Handle CDATA ourselves; sgmllib can't.
            k = self.rawdata.find(']]>', i)
            if k == -1:
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Malformed declaration: keep it as literal text and
                # resume parsing after it.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
1383
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # HTML input defaults to converting MS smart quotes to HTML
        # entities rather than XML character references.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    # Contents of these tags are treated as literal text, not markup.
    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Pull out the http-equiv and content attributes, remembering
        # where 'content' sits so it can be rewritten in place.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                # NOTE(review): getattr() here has no default, so the
                # first-pass lookup relies on Tag.__getattr__ resolving
                # 'declaredHTMLEncoding' without raising -- confirm.
                if getattr(self, 'declaredHTMLEncoding') or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag.
                    newAttr = self.CHARSET_RE.sub\
                              (lambda(match):match.group(1) +
                               "%SOUP-ENCODING%", value)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            # Mark the tag so tree output plugs in the real encoding.
            tag.containsSubstitutions = True
1530
class StopParsing(Exception):
    """Raised internally to abort a parse in progress -- e.g. when
    start_meta discovers the document's real encoding and re-feeds
    the markup from scratch."""
    pass
1533
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags that are valid (if unusual) when nested in
    # themselves. Fix: the original list contained 'strong' and 'big'
    # twice; the duplicates were redundant, since buildTagMap maps
    # each name to the default exactly once.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Extend BeautifulSoup's nesting rules with the lists above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1569
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap('noscript') passes 'noscript' as the
    # *default* with no tag arguments, so this evaluates to an empty
    # map (no tag resets nesting). Probably intended as
    # buildTagMap(None, 'noscript') -- confirm before changing.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    # No assumptions at all about which tags can nest.
    NESTABLE_TAGS = {}
1582
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the normal pop, mirror a single-string-child tag into
        # its parent's attribute map -- unless the parent already has
        # an attribute by that name.
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()  # ensure parent.attrMap is populated
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1613
1614#Enterprise class names! It has come to our attention that some people
1615#think the names of the Beautiful Soup parser classes are too silly
1616#and "unprofessional" for use in enterprise screen-scraping. We feel
1617#your pain! For such-minded folk, the Beautiful Soup Consortium And
1618#All-Night Kosher Bakery recommends renaming this file to
1619#"RobustParser.py" (or, in cases of extreme enterprisiness,
1620#"RobustParserBeanInterface.class") and using the following
1621#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
    pass
1632
1633######################################################
1634#
1635# Bonus library: Unicode, Dammit
1636#
1637# This class forces XML data into a standard format (usually to UTF-8
1638# or Unicode). It is heavily based on code from Mark Pilgrim's
1639# Universal Feed Parser. It does not rewrite the XML or HTML to
1640# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1641# (XML) and BeautifulSoup.start_meta (HTML).
1642
1643# Autodetects character encodings.
1644# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except:
    chardet = None
# NOTE(review): this unconditionally discards chardet even when the
# import above succeeded, disabling encoding auto-detection entirely.
# It looks like a deliberate local modification to the vendored file;
# confirm the intent before removing it.
chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass
1664
1665class UnicodeDammit:
1666 """A class for detecting the encoding of a *ML document and
1667 converting it to a Unicode string. If the source encoding is
1668 windows-1252, can replace MS smart quotes with their HTML or XML
1669 equivalents."""
1670
1671 # This dictionary maps commonly seen values for "charset" in HTML
1672 # meta tags to the corresponding Python codec names. It only covers
1673 # values that aren't in Python's aliases and can't be determined
1674 # by the heuristics in find_codec.
1675 CHARSET_ALIASES = { "macintosh" : "mac-roman",
1676 "x-sjis" : "shift-jis" }
1677
1678 def __init__(self, markup, overrideEncodings=[],
1679 smartQuotesTo='xml'):
1680 self.markup, documentEncoding, sniffedEncoding = \
1681 self._detectEncoding(markup)
1682 self.smartQuotesTo = smartQuotesTo
1683 self.triedEncodings = []
1684 if markup == '' or isinstance(markup, unicode):
1685 self.originalEncoding = None
1686 self.unicode = unicode(markup)
1687 return
1688
1689 u = None
1690 for proposedEncoding in overrideEncodings:
1691 u = self._convertFrom(proposedEncoding)
1692 if u: break
1693 if not u:
1694 for proposedEncoding in (documentEncoding, sniffedEncoding):
1695 u = self._convertFrom(proposedEncoding)
1696 if u: break
1697
1698 # If no luck and we have auto-detection library, try that:
1699 if not u and chardet and not isinstance(self.markup, unicode):
1700 u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1701
1702 # As a last resort, try utf-8 and windows-1252:
1703 if not u:
1704 for proposed_encoding in ("utf-8", "windows-1252"):
1705 u = self._convertFrom(proposed_encoding)
1706 if u: break
1707 self.unicode = u
1708 if not u: self.originalEncoding = None
1709
1710 def _subMSChar(self, orig):
1711 """Changes a MS smart quote character to an XML or HTML
1712 entity."""
1713 sub = self.MS_CHARS.get(orig)
1714 if type(sub) == types.TupleType:
1715 if self.smartQuotesTo == 'xml':
1716 sub = '&#x%s;' % sub[1]
1717 else:
1718 sub = '&%s;' % sub[0]
1719 return sub
1720
    def _convertFrom(self, proposed):
        """Try to decode self.markup with the proposed encoding.
        Returns the Unicode markup on success; None if the codec is
        unknown, already tried, or the decode fails."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            # Success: keep the decoded markup and remember the codec.
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup
1748
    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present); a recognized BOM also
        # overrides the caller's encoding, since it pins down the
        # actual byte order.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            # UTF-16BE BOM (the \x00\x00 test rules out a UTF-32 BOM
            # that begins with the same two bytes).
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            # UTF-16LE BOM.
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            # UTF-8 BOM.
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE BOM.
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE BOM.
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata
1773
    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding.
        Returns (data, declared encoding, sniffed encoding); the data
        is re-encoded to UTF-8 when a BOM or a UTF-16/32 byte pattern
        of '<?' was recognized."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
            # Look for an encoding declared in the XML prolog.
            xml_encoding_match = re.compile \
                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
                                 .match(xml_data)
        except:
            # Bare except is deliberate best-effort: any failure above
            # just means we have no declaration to report.
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            # If the declared encoding is a byte-order-ambiguous family
            # (ucs-2/ucs-4/utf-16/utf-32), trust the sniffed variant,
            # which already knows the byte order.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding
1836
1837
1838 def find_codec(self, charset):
1839 return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1840 or (charset and self._codec(charset.replace("-", ""))) \
1841 or (charset and self._codec(charset.replace("-", "_"))) \
1842 or charset
1843
1844 def _codec(self, charset):
1845 if not charset: return charset
1846 codec = None
1847 try:
1848 codecs.lookup(charset)
1849 codec = charset
1850 except (LookupError, ValueError):
1851 pass
1852 return codec
1853
    # Lazily-built 256-byte translation table from EBCDIC to ASCII,
    # cached on the class so it is constructed at most once per process.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the EBCDIC-encoded byte string *s* into ASCII.

        On first call, builds the translation table from the literal
        byte map below using the Python 2 string.maketrans API and
        stores it on the class; subsequent calls reuse it.
        """
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            # emap[i] is the ASCII byte value for EBCDIC byte value i.
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)
1879
1880 MS_CHARS = { '\x80' : ('euro', '20AC'),
1881 '\x81' : ' ',
1882 '\x82' : ('sbquo', '201A'),
1883 '\x83' : ('fnof', '192'),
1884 '\x84' : ('bdquo', '201E'),
1885 '\x85' : ('hellip', '2026'),
1886 '\x86' : ('dagger', '2020'),
1887 '\x87' : ('Dagger', '2021'),
1888 '\x88' : ('circ', '2C6'),
1889 '\x89' : ('permil', '2030'),
1890 '\x8A' : ('Scaron', '160'),
1891 '\x8B' : ('lsaquo', '2039'),
1892 '\x8C' : ('OElig', '152'),
1893 '\x8D' : '?',
1894 '\x8E' : ('#x17D', '17D'),
1895 '\x8F' : '?',
1896 '\x90' : '?',
1897 '\x91' : ('lsquo', '2018'),
1898 '\x92' : ('rsquo', '2019'),
1899 '\x93' : ('ldquo', '201C'),
1900 '\x94' : ('rdquo', '201D'),
1901 '\x95' : ('bull', '2022'),
1902 '\x96' : ('ndash', '2013'),
1903 '\x97' : ('mdash', '2014'),
1904 '\x98' : ('tilde', '2DC'),
1905 '\x99' : ('trade', '2122'),
1906 '\x9a' : ('scaron', '161'),
1907 '\x9b' : ('rsaquo', '203A'),
1908 '\x9c' : ('oelig', '153'),
1909 '\x9d' : '?',
1910 '\x9e' : ('#x17E', '17E'),
1911 '\x9f' : ('Yuml', ''),}
1912
1913#######################################################################
1914
1915
1916#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    # Parse an HTML document read from standard input and print its
    # indented ("prettified") rendering (Python 2 print statement).
    soup = BeautifulSoup(sys.stdin.read())
    print soup.prettify()
Note: See TracBrowser for help on using the repository browser.