#!/usr/bin/python # Take a MediaWiki article in XML format, as produced by wiki2xml, and extract # plain text. Uses the BeautifulSoup parser, since wiki2xml's output is not XML. # # Evan Jones # April, 2008 # Released under a BSD licence. # http://evanjones.ca/software/wikipedia2text.html import htmlentitydefs import re import BeautifulSoup # By default, BeautifulStoneSoup doesn't allow nesting class WikiSoup(BeautifulSoup.BeautifulStoneSoup): # Allow nesting most tags except and NESTABLE_TAGS = { # Forces a tag to pop back up to
. "heading": ["article"], "link": [], #Maybe only allow these under "link"? "target": [], "part": [], "trail": [], "extension": [], "template": [], "arg": [], "list": [], "listitem": [], "table": [], "tablerow": [], "tablecell": [], "bold": [], "italics": [], "sup": [], "sub": [], "preblock": [], "preline": [], } SELF_CLOSING_TAGS = { "space": None } def __init__(self, data): BeautifulSoup.BeautifulStoneSoup.__init__(self, data, convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES) # Set of plain text tags: we will extract text from inside these tags PLAIN_TAGS = set([ "bold", "italics", "sup", "sub", "preblock", "preline", "templatevar", # Used for some quote templates "part", ]) def extractLinkText(linkNode): """Extract text from a tag.""" assert linkNode.name == "link" try: if len(linkNode.contents) == 0: # return None first = linkNode.contents[0] if isinstance(first, BeautifulSoup.NavigableString): # text assert linkNode["type"] == "external" # External links could contain tags such as return "".join(extractText(linkNode)) assert first.name == "target" # can contain other tags, in particular