#!/usr/bin/python
# Take a MediaWiki article in XML format, as produced by wiki2xml, and extract
# plain text. Uses the BeautifulSoup parser, since wiki2xml's output is not XML.
#
# Evan Jones <[email protected]>
# April, 2008
# Released under a BSD licence.
# http://evanjones.ca/software/wikipedia2text.html
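#
# Example usage (a sketch; the script name is whatever this file is saved as,
# here assumed to be wikipedia2text.py):
#   python wikipedia2text.py article.xml > article.txt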

import htmlentitydefs
import re

import BeautifulSoup

# By default, BeautifulStoneSoup doesn't allow nesting
class WikiSoup(BeautifulSoup.BeautifulStoneSoup):
    # Allow nesting most tags except <paragraph> and <heading>

    NESTABLE_TAGS = {
        # Forces a <heading> tag to pop back up to <article>.
        "heading": ["article"],

        "link": [],

        # Maybe only allow these under "link"?
        "target": [],
        "part": [],
        "trail": [],

        "extension": [],
        "template": [],
        "arg": [],

        "list": [],
        "listitem": [],

        "table": [],
        "tablerow": [],
        "tablecell": [],

        "bold": [],
        "italics": [],

        "sup": [],
        "sub": [],
        "preblock": [],
        "preline": [],
    }

    SELF_CLOSING_TAGS = { "space": None }

    def __init__(self, data):
        BeautifulSoup.BeautifulStoneSoup.__init__(self, data,
                convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES)

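# A minimal sketch of how WikiSoup is used (the XML snippet is illustrative):
#   soup = WikiSoup("<article><paragraph>Hello<space/>world</paragraph></article>")
#   soup.findAll("paragraph")  # the nodes that extractText() below walks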

# Set of plain text tags: we will extract text from inside these tags
PLAIN_TAGS = set([
    "bold",
    "italics",
    "sup",
    "sub",
    "preblock",
    "preline",

    "templatevar",  # Used for some quote templates
    "part",
])


def extractLinkText(linkNode):
    """Extract text from a <link> tag."""
    assert linkNode.name == "link"

    try:
        if len(linkNode.contents) == 0:
            # <link href="..." type="external" />
            return None

        first = linkNode.contents[0]
        if isinstance(first, BeautifulSoup.NavigableString):
            # <link href="..." type="external">text</link>
            assert linkNode["type"] == "external"
            # External links could contain tags such as <space />
            return "".join(extractText(linkNode))

        assert first.name == "target"
        # <target> can contain other tags, in particular <template>
        target_text = "".join(extractText(first))

        # Skip Image, Category and language links
        if ":" in target_text:
            return None

        # target part? trail?
        # <link><target>foo</target><part>words</part></link>
        # <link><target>foo</target><part>word</part><trail>s</trail></link>
        # <link><target>foo</target><trail>s</trail></link>
        assert len(linkNode.contents) <= 3
        text = None
        foundPart = False
        foundTrail = False
        for child in linkNode:
            assert not foundTrail
            if child.name == "target":
                # If the target contains more than one thing, then this is a
                # bad link: extract nothing
                if len(child.contents) != 1:
                    return None
                assert text is None
                text = child.string
            elif child.name == "part":
                # Only take the first <part>. There should only be one, but
                # sometimes users add more
                if foundPart:
                    continue
                assert text is not None
                foundPart = True
                # The <part> can have HTML tags like <part>77<sup>th</sup></part>
                # Or worse, <template>
                text = "".join(extractText(child))

            elif child.name == "trail":
                assert text is not None
                foundTrail = True
                text += child.string
            else:
                assert False
        return text
    except:
        print linkNode
        raise
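
# For example, given the link formats handled above (sketches, not real data):
#   <link><target>dog</target><trail>s</trail></link>    -> u"dogs"
#   <link><target>Category:Lions</target></link>         -> None (":" in target)
#   <link href="http://example.com/" type="external"/>   -> None (no contents)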

# All the tags that should be skipped
SKIP_TAGS = set([
    "template",
    "ref",
    "table",
    "tablerow",
    "tablecell",
    "magic_variable",
    "list",
])
# All the extensions that should be skipped
SKIP_EXTENSIONS = set([
    "ref",
    "references",
    "imagemap",
    "gallery",
    "math",
    "hr",
    "timeline",
    "poem",
    "hiero",
])
INCLUDE_EXTENSIONS = set([
    "blockquote",
    "noinclude",
    "onlyinclude",
    "includeonly",
    "nowiki",
    "var",  # Variables: needed to understand math/physics
    "sarcasm",  # Incorrectly parsed <sarcasm> tags in the "Leet" article
])

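# For example (a sketch of wiki2xml's <extension> markup, as handled in
# extractText() below): <extension extension_name="ref">...</extension> is
# skipped entirely, while <extension extension_name="blockquote">text</extension>
# has its contents extracted like ordinary text.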
def extractText(paragraph_node):
    """Returns text extracted from Wikipedia XML <paragraph> nodes."""
    text = []
    for child in paragraph_node:
        if isinstance(child, BeautifulSoup.NavigableString):
            text.append(child.string)
        elif child.name in SKIP_TAGS:
            # Skip the contents of templates, references and tables
            continue
        elif child.name == "extension":
            if len(child.contents) == 0:
                # If the extension is empty we don't care.
                continue
            name = child["extension_name"]
            if name in SKIP_EXTENSIONS:
                continue
            elif name in INCLUDE_EXTENSIONS:
                # Extract text from extensions which just include text
                text.extend(extractText(child))
            else:
                print child
                raise ValueError("Unknown extension: " + name)
        elif child.name == "link":
            extracted = extractLinkText(child)
            if extracted is not None:
                text.append(extracted)
        elif child.name == "space":
            assert len(child.contents) == 0
            text.append(" ")
        else:
            # Recursively extract text out of tags like <italics>
            if not (child.name in PLAIN_TAGS or child.name.startswith("xhtml")):
                print child
            assert child.name in PLAIN_TAGS or child.name.startswith("xhtml")
            text.extend(extractText(child))

    return text
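
# A minimal sketch of the behaviour: given a parsed fragment such as
#   <paragraph>See <italics>lions</italics><template>...</template></paragraph>
# extractText() returns [u"See ", u"lions"]: <italics> is in PLAIN_TAGS and is
# recursed into, while the <template> contents are skipped via SKIP_TAGS.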


# Stolen from scrape: http://zesty.ca/python/scrape.py
HTML_ENTITY = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def HTMLDecode(text):
    """Decodes HTML entities in text."""

    def HTMLEntityReplace(match):
        entity = match.group(1)
        if entity.startswith('#x'):
            return unichr(int(entity[2:], 16))
        elif entity.startswith('#'):
            return unichr(int(entity[1:]))
        elif entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])
        else:
            return match.group(0)

    return HTML_ENTITY.sub(HTMLEntityReplace, text)
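
# For example: HTMLDecode(u"caf&eacute; &#38; &#x2603;") returns u"caf\xe9 & \u2603"
# (named, decimal, and hexadecimal entities are all decoded in one pass).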


def extractWikipediaText(filename):
    """Extract text from a Wikipedia article in XML format. Returns Unicode text."""

    input = open(filename)
    data = input.read()
    input.close()

    dom = WikiSoup(data)
    text = []
    # Iterate over the paragraphs
    for paragraph in dom.findAll("paragraph"):
        try:
            parts = extractText(paragraph)
        except:
            print paragraph
            raise
        for i, fragment in enumerate(parts):
            # wiki2xml does not convert &nbsp; or other HTML entities
            parts[i] = HTMLDecode(fragment)
        text.extend(parts)
        text.append("\n\n")

    return "".join(text)
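
# A sketch of calling this from Python rather than the command line
# ("Lion.xml" is a hypothetical input file produced by wiki2xml):
#   text = extractWikipediaText("Lion.xml")
#   print text.encode("UTF-8")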


if __name__ == "__main__":
    import sys
    sys.stdout.write(extractWikipediaText(sys.argv[1]).encode("UTF-8"))