Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/gsdl/perllib/Kea-1.1.4/prepare-clauses.pl@ 3161

Last change on this file since 3161 was 1972, checked in by jmt14, 23 years ago
* empty log message *
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 3.2 KB

Rev	Line
[1972]	1	#!/usr/bin/perl -w
	2
	3	# prepare-clauses.pl
	4	# Version 1.1
	5
	6	# Kea -- Automatic Keyphrase Extraction
	7	# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
	8	# Contact [email protected] or [email protected]
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23
	24	# Version history
	25	#
	26	# 1.0 Witten et al.
	27	# 1.0.1 Bug: Sentences ending in ".)"
	28	# 1.0.2 Bug: Sentences ending in ".'" and so on have the same problem.
	29	# 1.0.3 All "." characters removed from the start of tokens.
	30	# 1.1 First Distribution. GPL added.
	31
	# Validate the command line.  Both arguments must be present and
	# non-empty; length() is tested rather than truthiness so that a file
	# legitimately named "0" is accepted.
	if (@ARGV < 2 || !length($ARGV[0]) || !length($ARGV[1])) {
	    # The usage text names this script and describes its actual output.
	    die "usage: prepare-clauses.pl <input-text> <output-clauses>\n";
	}

	# $infile is the text to process; $outfile receives the final clauses.
	# Both are read by the processing steps further down the script, which
	# also derive the intermediate file names "$infile.1" and "$infile.2".
	my $infile  = $ARGV[0];
	my $outfile = $ARGV[1];

	# Announce what is about to be processed.
	print "prepare-clauses.pl\n";
	print "Input text: $infile\n";
	print "Output clauses: $outfile\n";
	42
	# step 1: put every sentence on a new line.
	#
	# Reads $infile line by line and writes the intermediate file
	# "$infile.1".  Sentence ends become a "." followed by a newline, and
	# punctuation is surrounded by spaces so that a later whitespace split
	# yields one token per word or symbol.

	# Three-argument open keeps characters such as ">" or "|" in a file name
	# from being taken as an open mode, and a failed open is fatal instead of
	# silently producing empty output.
	open(my $text_in, '<', $infile)
	    or die "prepare-clauses.pl: cannot read $infile: $!\n";
	open(my $sentences_out, '>', "$infile.1")
	    or die "prepare-clauses.pl: cannot write $infile.1: $!\n";

	while (my $line = <$text_in>) {

	    # remove hyphens at end of line; the line break is removed with
	    # them, so the split word is rejoined with the next input line
	    $line =~ s/\-\s+$//;

	    # replace \n and other whitespace with space
	    $line =~ s/\s+/ /g;

	    # one sentence per line: normalise each sentence end (. ! ?) and the
	    # non-alphanumerics following it to ". ", then turn every sentence
	    # end followed by spaces into a space plus a line break
	    $line =~ s/[\.\!\?][^A-Za-z0-9]+/. /g;
	    $line =~ s/[\.\!\?] +/\ \n/g;

	    # space out the punctuation
	    # double-hyphens (equivalent to an M-dash) replaced with a clause breaker
	    $line =~ s/\-\-/ : /g;
	    # delete apostrophe marks
	    $line =~ s/\'//g;
	    # remove all "." characters from the start of a token
	    $line =~ s/ \./ /g;

	    # space out any other character that is not something we are interested in
	    $line =~ s/([^\w\.\n])/ $1 /g;
	    # allow for underscore, which perl considers a "word" character
	    $line =~ s/_/ _ /g;

	    # squeeze spaces, then drop a single leading space (the pattern has
	    # no /m, so only the very start of the string is affected)
	    $line =~ s/\ +/ /g;
	    $line =~ s/^\ //g;

	    # put a fullstop at the end of each line/sentence
	    $line =~ s/\n/.\n/g;

	    # print; a line with nothing left (for example a blank input line)
	    # is written as a lone full stop on a line of its own
	    if ($line =~ /./) {
	        print {$sentences_out} $line;
	    } else {
	        print {$sentences_out} ".\n";
	    }

	}
	close($text_in);
	# Buffered write errors only surface when the handle is closed.
	close($sentences_out)
	    or die "prepare-clauses.pl: error writing $infile.1: $!\n";
	90
	91	# step 2: the tagger
	92	# `tag $infile.lines > $infile.tagged`;
	93
	# step 3: put a clause on each line and get rid of punctuation
	#
	# Reads the sentence-per-line file "$infile.1" and writes "$infile.2".
	# Each input line is split on whitespace; clause-breaking tokens start a
	# new output line, tokens containing a letter are kept, and everything
	# else is dropped.

	# Three-argument open with an explicit check: a failure to open either
	# file stops the script instead of silently writing nothing.
	open(my $sentences_in, '<', "$infile.1")
	    or die "prepare-clauses.pl: cannot read $infile.1: $!\n";
	open(my $clauses_out, '>', "$infile.2")
	    or die "prepare-clauses.pl: cannot write $infile.2: $!\n";

	while (my $sentence = <$sentences_in>) {

	    foreach my $word (split(/\s+/, $sentence)) {
	        if ($word =~ /^[\:\;\,\[\]\{\}\(\)]$/) {
	            # print new line for clause breakers (. and ? and ! done above)
	            print {$clauses_out} "\n";
	        } elsif ($word =~ /^\d+$/ || $word =~ /^\d+\.\d+$/) {
	            # "pure" numbers (integers and simple decimals) are clause breakers
	            print {$clauses_out} "\n";
	        } elsif ($word =~ /[A-Za-z]/) {
	            # print anything that contains at least one letter
	            print {$clauses_out} " $word";
	        } else {
	            # simply ignore whatever is left over
	        }
	    }
	    # every input line (sentence) also ends the current clause
	    print {$clauses_out} "\n";
	}
	close($sentences_in);
	# Buffered write errors only surface when the handle is closed.
	close($clauses_out)
	    or die "prepare-clauses.pl: error writing $infile.2: $!\n";
	120
	# step 4: pretty things up a tad
	#
	# Copies "$infile.2" to $outfile with leading spaces stripped from every
	# line and runs of newlines squeezed to a single newline, then deletes
	# the two intermediate files.  This is done in Perl rather than by
	# interpolating the file names into shell command lines, so names that
	# contain spaces or shell metacharacters are handled safely and
	# failures are reported instead of being ignored.
	open(my $clauses_in, '<', "$infile.2")
	    or die "prepare-clauses.pl: cannot read $infile.2: $!\n";
	open(my $final_out, '>', $outfile)
	    or die "prepare-clauses.pl: cannot write $outfile: $!\n";

	# True once the most recently written text ended in a newline.  It
	# starts false, so a single empty line at the very top of the input is
	# kept, which is what squeezing with "tr -s" on newlines would leave.
	my $after_newline = 0;
	while (my $clause = <$clauses_in>) {
	    # strip leading spaces
	    $clause =~ s/^ +//;

	    # nothing left to write (only possible on an unterminated last line)
	    next if $clause eq '';
	    # an empty line directly after a newline would repeat it: squeeze
	    next if $clause eq "\n" && $after_newline;

	    print {$final_out} $clause;
	    $after_newline = ($clause =~ /\n\z/) ? 1 : 0;
	}
	close($clauses_in);
	# Buffered write errors only surface when the handle is closed.
	close($final_out)
	    or die "prepare-clauses.pl: error writing $outfile: $!\n";

	# Remove the intermediate files.  A failure here does not affect the
	# finished output, so it is reported as a warning only.
	foreach my $temp_file ("$infile.1", "$infile.2") {
	    unlink($temp_file)
	        or warn "prepare-clauses.pl: cannot remove $temp_file: $!\n";
	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: