Context Navigation

extracttop.py@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago
NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!
File size: 795 bytes

Line
1	#!/usr/bin/python
2	# Extract titles from the Wikipedia top articles pages.
3	#
4	# Evan Jones <[email protected]>
5	# April, 2008
6	# Released under a BSD licence.
7	# http://evanjones.ca/software/wikipedia2text.html
8	#
9	# Top articles pages can be found at:
10	# http://en.wikipedia.org/wiki/Wikipedia:Release_Version
11	# http://en.wikipedia.org/wiki/Wikipedia:Version_1.0_Editorial_Team/Release_Version_articles_by_quality2
12
13	import re
14	import sys
15	import urllib
16
# Python 2 exposes unquote on urllib itself; Python 3 moved it to urllib.parse.
# Importing it here keeps the script runnable on either interpreter.
try:
    from urllib import unquote as _unquote
except ImportError:
    from urllib.parse import unquote as _unquote

# Captures the article name from a table cell whose link target contains
# "/wiki/<name>". Used with .match(), so the cell must start the line.
EXTRACT = re.compile(r'<td><a href="[^"]*/wiki/([^"]+)"')


def extract_titles(lines):
    """Yield the article title for every line that starts with a wiki link cell.

    Args:
        lines: Any iterable of strings (for example an open text file).

    Yields:
        The captured link target with percent-escapes decoded and
        underscores replaced by spaces. Lines that do not begin with the
        EXTRACT pattern are skipped.
    """
    for line in lines:
        found = EXTRACT.match(line)
        if found:
            # Convert escape sequences, then turn "_" into " ".
            yield _unquote(found.group(1)).replace("_", " ")


def main(filenames):
    """Print one extracted title per line for each file named in *filenames*."""
    for filename in filenames:
        # The with-block closes each file as soon as it has been scanned.
        with open(filename) as source:
            for title in extract_titles(source):
                print(title)


if __name__ == "__main__":
    main(sys.argv[1:])

Note: See TracBrowser for help on using the repository browser.