#!/usr/bin/python
# Take a MediaWiki article in XML format, as produced by wiki2xml, and extract
# plain text. Uses the BeautifulSoup parser, since wiki2xml's output is not XML.
#
# Evan Jones <[email protected]>
# April, 2008
# Released under a BSD licence.
# http://evanjones.ca/software/wikipedia2text.html
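#
# Example usage (a sketch; the script name is whatever this file is saved as,
# here assumed to be wikipedia2text.py):
#   python wikipedia2text.py article.xml > article.txt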

import htmlentitydefs
import re

import BeautifulSoup

# By default, BeautifulStoneSoup doesn't allow nesting
class WikiSoup(BeautifulSoup.BeautifulStoneSoup):
    # Allow nesting most tags except <paragraph> and <heading>

    NESTABLE_TAGS = {
        # Forces a <heading> tag to pop back up to <article>.
        "heading": ["article"],

        "link": [],

        # Maybe only allow these under "link"?
        "target": [],
        "part": [],
        "trail": [],

        "extension": [],
        "template": [],
        "arg": [],

        "list": [],
        "listitem": [],

        "table": [],
        "tablerow": [],
        "tablecell": [],

        "bold": [],
        "italics": [],

        "sup": [],
        "sub": [],
        "preblock": [],
        "preline": [],
    }

    SELF_CLOSING_TAGS = { "space": None }

    def __init__(self, data):
        BeautifulSoup.BeautifulStoneSoup.__init__(self, data,
                convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES)

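# A minimal sketch of how WikiSoup is used (the XML snippet is illustrative):
#   soup = WikiSoup("<article><paragraph>Hello<space/>world</paragraph></article>")
#   soup.findAll("paragraph")  # the nodes that extractText() below walks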

# Set of plain text tags: we will extract text from inside these tags
PLAIN_TAGS = set([
    "bold",
    "italics",
    "sup",
    "sub",
    "preblock",
    "preline",

    "templatevar",  # Used for some quote templates
    "part",
])


def extractLinkText(linkNode):
    """Extract text from a <link> tag."""
    assert linkNode.name == "link"

    try:
        if len(linkNode.contents) == 0:
            # <link href="..." type="external" />
            return None

        first = linkNode.contents[0]
        if isinstance(first, BeautifulSoup.NavigableString):
            # <link href="..." type="external">text</link>
            assert linkNode["type"] == "external"
            # External links could contain tags such as <space />
            return "".join(extractText(linkNode))

        assert first.name == "target"
        # <target> can contain other tags, in particular <template>
        target_text = "".join(extractText(first))

        # Skip Image, Category and language links
        if ":" in target_text:
            return None

        # target part? trail?
        # <link><target>foo</target><part>words</part></link>
        # <link><target>foo</target><part>word</part><trail>s</trail></link>
        # <link><target>foo</target><trail>s</trail></link>
        assert len(linkNode.contents) <= 3
        text = None
        foundPart = False
        foundTrail = False
        for child in linkNode:
            assert not foundTrail
            if child.name == "target":
                # If the target contains more than one thing, then this is a
                # bad link: extract nothing
                if len(child.contents) != 1:
                    return None
                assert text is None
                text = child.string
            elif child.name == "part":
                # Only take the first <part>. There should only be one, but
                # sometimes users add more
                if foundPart:
                    continue
                assert text is not None
                foundPart = True
                # The <part> can have HTML tags like <part>77<sup>th</sup></part>
                # Or worse, <template>
                text = "".join(extractText(child))

            elif child.name == "trail":
                assert text is not None
                foundTrail = True
                text += child.string
            else:
                assert False
        return text
    except:
        print linkNode
        raise
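
# For example, given the link formats handled above (sketches, not real data):
#   <link><target>dog</target><trail>s</trail></link>    -> u"dogs"
#   <link><target>Category:Lions</target></link>         -> None (":" in target)
#   <link href="http://example.com/" type="external"/>   -> None (no contents)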

# All the tags that should be skipped
SKIP_TAGS = set([
    "template",
    "ref",
    "table",
    "tablerow",
    "tablecell",
    "magic_variable",
    "list",
])
# All the extensions that should be skipped
SKIP_EXTENSIONS = set([
    "ref",
    "references",
    "imagemap",
    "gallery",
    "math",
    "hr",
    "timeline",
    "poem",
    "hiero",
])
INCLUDE_EXTENSIONS = set([
    "blockquote",
    "noinclude",
    "onlyinclude",
    "includeonly",
    "nowiki",
    "var",  # Variables: needed to understand math/physics
    "sarcasm",  # Incorrectly parsed <sarcasm> tags in the "Leet" article
])

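# For example (a sketch of wiki2xml's <extension> markup, as handled in
# extractText() below): <extension extension_name="ref">...</extension> is
# skipped entirely, while <extension extension_name="blockquote">text</extension>
# has its contents extracted like ordinary text.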
def extractText(paragraph_node):
    """Returns text extracted from Wikipedia XML <paragraph> nodes."""
    text = []
    for child in paragraph_node:
        if isinstance(child, BeautifulSoup.NavigableString):
            text.append(child.string)
        elif child.name in SKIP_TAGS:
            # Skip the contents of templates, references and tables
            continue
        elif child.name == "extension":
            if len(child.contents) == 0:
                # If the extension is empty we don't care.
                continue
            name = child["extension_name"]
            if name in SKIP_EXTENSIONS:
                continue
            elif name in INCLUDE_EXTENSIONS:
                # Extract text from extensions which just include text
                text.extend(extractText(child))
            else:
                print child
                raise ValueError("Unknown extension: " + name)
        elif child.name == "link":
            extracted = extractLinkText(child)
            if extracted is not None:
                text.append(extracted)
        elif child.name == "space":
            assert len(child.contents) == 0
            text.append(" ")
        else:
            # Recursively extract text out of tags like <italics>
            if not (child.name in PLAIN_TAGS or child.name.startswith("xhtml")):
                print child
            assert child.name in PLAIN_TAGS or child.name.startswith("xhtml")
            text.extend(extractText(child))

    return text
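
# A minimal sketch of the behaviour: given a parsed fragment such as
#   <paragraph>See <italics>lions</italics><template>...</template></paragraph>
# extractText() returns [u"See ", u"lions"]: <italics> is in PLAIN_TAGS and is
# recursed into, while the <template> contents are skipped via SKIP_TAGS.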


# Stolen from scrape: http://zesty.ca/python/scrape.py
HTML_ENTITY = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def HTMLDecode(text):
    """Decodes HTML entities in text."""

    def HTMLEntityReplace(match):
        entity = match.group(1)
        if entity.startswith('#x'):
            return unichr(int(entity[2:], 16))
        elif entity.startswith('#'):
            return unichr(int(entity[1:]))
        elif entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])
        else:
            return match.group(0)

    return HTML_ENTITY.sub(HTMLEntityReplace, text)
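
# For example: HTMLDecode(u"caf&eacute; &#38; &#x2603;") returns u"caf\xe9 & \u2603"
# (named, decimal, and hexadecimal entities are all decoded in one pass).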


def extractWikipediaText(filename):
    """Extract text from a Wikipedia article in XML format. Returns Unicode text."""

    input = open(filename)
    data = input.read()
    input.close()

    dom = WikiSoup(data)
    text = []
    # Iterate over the paragraphs
    for paragraph in dom.findAll("paragraph"):
        try:
            parts = extractText(paragraph)
        except:
            print paragraph
            raise
        for i, fragment in enumerate(parts):
            # wiki2xml does not convert &nbsp; or other HTML entities
            parts[i] = HTMLDecode(fragment)
        text.extend(parts)
        text.append("\n\n")

    return "".join(text)
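
# A sketch of calling this from Python rather than the command line
# ("Lion.xml" is a hypothetical input file produced by wiki2xml):
#   text = extractWikipediaText("Lion.xml")
#   print text.encode("UTF-8")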


if __name__ == "__main__":
    import sys
    sys.stdout.write(extractWikipediaText(sys.argv[1]).encode("UTF-8"))