source: gs2-extensions/ngramj/src/wiki/wikipedia2text/extracttop.py@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 795 bytes
Line 
1#!/usr/bin/python
2# Extract titles from the Wikipedia top articles pages.
3#
4# Evan Jones <[email protected]>
5# April, 2008
6# Released under a BSD licence.
7# http://evanjones.ca/software/wikipedia2text.html
8#
9# Top articles pages can be found at:
10# http://en.wikipedia.org/wiki/Wikipedia:Release_Version
11# http://en.wikipedia.org/wiki/Wikipedia:Version_1.0_Editorial_Team/Release_Version_articles_by_quality2
12
13import re
14import sys
15import urllib
16
# Python 2 exposes unquote() directly on the urllib module; Python 3 moved it
# to urllib.parse. Resolve it once here so the rest of the script does not
# care which interpreter is running it.
try:
    from urllib.parse import unquote as _unquote
except ImportError:
    _unquote = urllib.unquote

# Captures the article name that follows "/wiki/" in the href of a link that
# sits directly inside a <td>. It is applied with .match(), so only lines that
# *begin* with '<td><a href="' are recognised.
EXTRACT = re.compile(r'<td><a href="[^"]*/wiki/([^"]+)"')


def extract_titles(lines):
    """Yield the article title found at the start of each matching line.

    Args:
        lines: any iterable of text lines (an open file, a list of strings).

    Yields:
        One title per line that starts with a ``<td><a href=".../wiki/NAME"``
        link, with URL percent-escapes decoded and underscores replaced by
        spaces. Lines that do not match are skipped silently.
    """
    for line in lines:
        match = EXTRACT.match(line)
        if match:
            # Decode %XX escapes first, then turn "_" into " ".
            yield _unquote(match.group(1)).replace("_", " ")


def main(filenames):
    """Print every title extracted from each file in *filenames*, one per line.

    Args:
        filenames: iterable of paths to saved HTML pages.
    """
    for filename in filenames:
        # The context manager closes each file as soon as it has been read;
        # the original loop left every handle open until interpreter exit.
        with open(filename) as source:
            for title in extract_titles(source):
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print(title)


if __name__ == "__main__":
    main(sys.argv[1:])
Note: See TracBrowser for help on using the repository browser.