source: gs2-extensions/ngramj/src/wiki/wikipedia2text/wikisoup.py@ 25141

#!/usr/bin/python
# Take a MediaWiki article in XML format, as produced by wiki2xml, and extract
# plain text. Uses the BeautifulSoup parser, since wiki2xml's output is not XML.
#
# Evan Jones <[email protected]>
# April, 2008
# Released under a BSD licence.
# http://evanjones.ca/software/wikipedia2text.html
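#
# As a rough illustration (a hypothetical sketch of wiki2xml output, not taken
# from this repository), a paragraph such as
#   <paragraph>A <bold>bold</bold> <link><target>Page</target><trail>s</trail></link>.</paragraph>
# should come out as the plain text "A bold Pages."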

import htmlentitydefs
import re
import unicodedata

import BeautifulSoup
# By default, BeautifulStoneSoup doesn't allow nesting
class WikiSoup(BeautifulSoup.BeautifulStoneSoup):
    # Allow nesting most tags except <paragraph> and <heading>

    NESTABLE_TAGS = {
        # Forces a <heading> tag to pop back up to <article>.
        "heading": ["article"],

        "link": [],

        # Maybe only allow these under "link"?
        "target": [],
        "part": [],
        "trail": [],

        "extension": [],
        "template": [],
        "arg": [],

        "list": [],
        "listitem": [],

        "table": [],
        "tablerow": [],
        "tablecell": [],

        "bold": [],
        "italics": [],

        "sup": [],
        "sub": [],
        "preblock": [],
        "preline": [],
    }

    SELF_CLOSING_TAGS = { "space": None }

    def __init__(self, data):
        BeautifulSoup.BeautifulStoneSoup.__init__(self, data,
            convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES)

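# Usage sketch (hypothetical, not part of the original script): parse a
# wiki2xml document and walk its <paragraph> nodes.
#   soup = WikiSoup(open("article.xml").read())
#   for paragraph in soup.findAll("paragraph"):
#       print extractText(paragraph)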

# Set of plain text tags: we will extract text from inside these tags
PLAIN_TAGS = set([
    "bold",
    "italics",
    "sup",
    "sub",
    "preblock",
    "preline",

    "templatevar",  # Used for some quote templates
    "part",
])

def extractLinkText(linkNode):
    """Extract text from a <link> tag."""
    assert linkNode.name == "link"

    try:
        if len(linkNode.contents) == 0:
            # <link href="..." type="external" />
            return None

        first = linkNode.contents[0]
        if isinstance(first, BeautifulSoup.NavigableString):
            # <link href="..." type="external">text</link>
            assert linkNode["type"] == "external"
            # External links could contain tags such as <space />
            return "".join(extractText(linkNode))

        assert first.name == "target"
        # <target> can contain other tags, in particular <template>
        target_text = "".join(extractText(first))

        # Skip Image, Category and language links
        if ":" in target_text:
            return None

        # target part? trail?
        # <link><target>foo</target><part>words</part></link>
        # <link><target>foo</target><part>word</part><trail>s</trail></link>
        # <link><target>foo</target><trail>s</trail></link>
        assert len(linkNode.contents) <= 3
        text = None
        foundPart = False
        foundTrail = False
        for child in linkNode:
            assert not foundTrail
            if child.name == "target":
                # If the target contains more than one thing, then this is a bad link: extract nothing
                if len(child.contents) != 1:
                    return None
                assert text is None
                text = child.string
            elif child.name == "part":
                # Only take the first <part>. There should only be one, but sometimes users add more
                if foundPart:
                    continue
                assert text is not None
                foundPart = True
                # The <part> can have HTML tags like <part>77<sup>th</sup></part>
                # Or worse, <template>
                text = "".join(extractText(child))
            elif child.name == "trail":
                assert text is not None
                foundTrail = True
                text += child.string
            else:
                assert False
        return text
    except:
        print linkNode
        raise
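# Illustrative results (hypothetical inputs, not from the repository):
#   <link><target>foo</target><part>words</part></link>   -> u"words"
#   <link><target>Image:Foo.png</target></link>           -> None (colon target: skipped)
#   <link href="http://example.com" type="external" />    -> None (no contents)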

# All the tags that should be skipped
SKIP_TAGS = set([
    "template",
    "ref",
    "table",
    "tablerow",
    "tablecell",
    "magic_variable",
    "list",
])
# All the extensions that should be skipped
SKIP_EXTENSIONS = set([
    "ref",
    "references",
    "imagemap",
    "gallery",
    "math",
    "hr",
    "timeline",
    "poem",
    "hiero",
])
INCLUDE_EXTENSIONS = set([
    "blockquote",
    "noinclude",
    "onlyinclude",
    "includeonly",
    "nowiki",
    "var",  # Variables: needed to understand math/physics
    "sarcasm",  # Incorrectly parsed <sarcasm> tags in the "Leet" article
])

def extractText(paragraph_node):
    """Returns text extracted from Wikipedia XML <paragraph> nodes."""
    text = []
    for child in paragraph_node:
        if isinstance(child, BeautifulSoup.NavigableString):
            text.append(child.string)
        elif child.name in SKIP_TAGS:
            # Skip the contents of templates, references and tables
            continue
        elif child.name == "extension":
            if len(child.contents) == 0:
                # If the extension is empty we don't care.
                continue
            name = child["extension_name"]
            if name in SKIP_EXTENSIONS:
                continue
            elif name in INCLUDE_EXTENSIONS:
                # Extract text from extensions which just include text
                text.extend(extractText(child))
            else:
                print child
                raise ValueError("Unknown extension: " + name)
        elif child.name == "link":
            extracted = extractLinkText(child)
            if extracted is not None:
                text.append(extracted)
        elif child.name == "space":
            assert len(child.contents) == 0
            text.append(" ")
        else:
            # Recursively extract text out of tags like <italics>
            if not (child.name in PLAIN_TAGS or child.name.startswith("xhtml")):
                print child
            assert child.name in PLAIN_TAGS or child.name.startswith("xhtml")
            text.extend(extractText(child))

    return text
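# Illustrative example (hypothetical input, not from the repository): for a node
# parsed from
#   <paragraph>Some <bold>text</bold> and a<space />break.</paragraph>
# extractText returns the fragment list
#   [u"Some ", u"text", u" and a", u" ", u"break."]
# which callers join into a single string.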


# Stolen from scrape: http://zesty.ca/python/scrape.py
HTML_ENTITY = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def HTMLDecode(text):
    """Decodes HTML entities in text."""

    def HTMLEntityReplace(match):
        entity = match.group(1)
        if entity.startswith('#x'):
            return unichr(int(entity[2:], 16))
        elif entity.startswith('#'):
            return unichr(int(entity[1:]))
        elif entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])
        else:
            return match.group(0)

    return HTML_ENTITY.sub(HTMLEntityReplace, text)
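# For example (illustrative only):
#   HTMLDecode(u"caf&eacute; &#233; &#x43;") == u"caf\xe9 \xe9 C"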


def extractWikipediaText(filename):
    """Extract text from a Wikipedia article in XML format. Returns Unicode text normalized in NFKC form."""

    input = open(filename)
    data = input.read()
    input.close()

    dom = WikiSoup(data)
    text = []
    # Iterate over the paragraphs
    for paragraph in dom.findAll("paragraph"):
        try:
            parts = extractText(paragraph)
        except:
            print paragraph
            raise
        for i, fragment in enumerate(parts):
            # wiki2xml does not convert &nbsp; or other HTML entities
            parts[i] = HTMLDecode(fragment)
        text.extend(parts)
        text.append("\n\n")

    # Normalize the combined text to NFKC form
    return unicodedata.normalize("NFKC", u"".join(text))


if __name__ == "__main__":
    import sys
    sys.stdout.write(extractWikipediaText(sys.argv[1]).encode("UTF-8"))
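# Typical invocation (usage sketch, assuming a hypothetical Article.xml produced
# by wiki2xml):
#   python wikisoup.py Article.xml > Article.txt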