Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/perllib/Kea-1.1.4/prepare-clauses.pl@ 1972

Last change on this file since 1972 was 1972, checked in by jmt14, 23 years ago
* empty log message *
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 3.2 KB

Line
1	#!/usr/bin/perl -w
2
3	# prepare-clauses.pl
4	# Version 1.1
5
6	# Kea -- Automatic Keyphrase Extraction
7	# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8	# Contact [email protected] or [email protected]
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24	# Version history
25	#
26	# 1.0 Witten et.al.
27	# 1.0.1 Bug: Sentences ending in ".)"
28	# 1.0.2 Bug: Sentences ending in ".'" and so on have the same problem.
29	# 1.0.3 All "." characters removed from the start of tokens.
30	# 1.1 First Distribution. GPL added.
31
# Command-line handling.
#
# Expects exactly two non-empty arguments: the input text file and the
# output file that will receive one clause per line.  Extra arguments are
# ignored.  The test is on defined/length rather than truthiness so that a
# file literally named "0" is accepted.
my ($infile, $outfile) = @ARGV;

if (   !defined $infile
    || !length $infile
    || !defined $outfile
    || !length $outfile)
{
    die "usage: prepare-clauses.pl <input-text> <output-clauses>\n";
}

# Progress banner on STDOUT.
print "prepare-clauses.pl\n";
print "Input text: $infile\n";
print "Output clauses: $outfile\n";
42
# step 1: put every sentence on a new line.
#
# Reads $infile line by line and writes the intermediate file "$infile.1".
# Input line breaks are turned into spaces, so the only newlines in the
# output are the ones inserted after sentence-ending punctuation (plus the
# ".\n" emitted for input lines that reduce to nothing).

# Three-argument open with lexical handles: the filename can never be
# mistaken for an open mode, and a failure stops the script instead of
# silently producing empty output.
open(my $text_in, '<', $infile)
    or die "prepare-clauses.pl: cannot read $infile: $!\n";
open(my $sentences_out, '>', "$infile.1")
    or die "prepare-clauses.pl: cannot write $infile.1: $!\n";

while (<$text_in>) {

    # remove hyphens at end of line (the newline goes with it, so the text
    # of the next input line follows on directly in the output)
    s/\-\s+$//;

    # replace \n and other whitespace with space
    s/\s+/ /g;

    # one sentence per line: first normalise ". ", "!" or "?" followed by
    # non-alphanumerics to ". ", then turn that into a line break
    s/[\.\!\?][^A-Za-z0-9]+/. /g;
    s/[\.\!\?] +/\ \n/g;

    # space out the punctuation
    # double-hyphens (equivalent to an M-dash) replaced with a clause breaker
    s/\-\-/ : /g;
    # delete apostrophe marks
    s/\'//g;
    # remove all "." characters from the start of a token
    s/ \./ /g;

    # space out any other character that is not something we are interested in
    s/([^\w\.\n])/ $1 /g;
    # allow for underscore, which perl considers a "word" character
    s/_/ _ /g;

    # squeeze spaces
    s/\ +/ /g;
    s/^\ //g;

    # put a fullstop at the end of each line/sentence
    s/\n/.\n/g;

    # print; a line with no non-newline character left (e.g. a blank input
    # line) is written as a lone full stop
    if ($_ =~ /./) {
        print $sentences_out "$_";
    } else {
        print $sentences_out ".\n";
    }

}
close($text_in);
# Buffered write errors only surface at close, so check it.
close($sentences_out)
    or die "prepare-clauses.pl: error writing $infile.1: $!\n";
90
# step 2: the tagger (disabled)
# `tag $infile.lines > $infile.tagged`;

# step 3: put a clause on each line and get rid of punctuation
#
# Reads the sentence-per-line file "$infile.1" and writes "$infile.2".
# Each sentence is split on whitespace; every token either ends the current
# clause (prints a newline), is kept (printed with a leading space), or is
# dropped.  The leading spaces and blank lines this produces are left for
# step 4 to clean up.

open(my $sentences_in, '<', "$infile.1")
    or die "prepare-clauses.pl: cannot read $infile.1: $!\n";
open(my $clauses_out, '>', "$infile.2")
    or die "prepare-clauses.pl: cannot write $infile.2: $!\n";

while (my $sentence = <$sentences_in>) {

    foreach my $token (split(/\s+/, $sentence)) {
        if ($token =~ /^[\:\;\,\[\]\{\}\(\)]$/) {
            # print new line for clause breakers (. and ? and ! done above)
            print $clauses_out "\n";
        } elsif (($token =~ /^\d+$/) || ($token =~ /^\d+\.\d+$/)) {
            # "pure" numbers are clause breakers
            print $clauses_out "\n";
        } elsif ($token =~ /[A-Za-z]/) {
            # print anything that contains at least one letter
            print $clauses_out " $token";
        } else {
            # simply ignore whatever is left over
        }
    }
    # end of sentence is always end of clause
    print $clauses_out "\n";
}
close($sentences_in);
# Buffered write errors only surface at close, so check it.
close($clauses_out)
    or die "prepare-clauses.pl: error writing $infile.2: $!\n";
120
# step 4: pretty things up a tad
#
# Copies "$infile.2" to $outfile, stripping leading spaces from every line
# and collapsing each run of consecutive newlines into a single one, then
# removes the two intermediate files.
#
# This is done in Perl rather than through a shell pipeline
# (cat | perl -ne | tr -s) so that filenames containing spaces or shell
# metacharacters are handled safely and failures are reported.

open(my $clauses_in, '<', "$infile.2")
    or die "prepare-clauses.pl: cannot read $infile.2: $!\n";
open(my $final_out, '>', $outfile)
    or die "prepare-clauses.pl: cannot write $outfile: $!\n";

# True when the last character written was a newline.  It starts false, so
# a blank first line is still written once, exactly as `tr -s '\n'` would.
my $last_was_newline = 0;

while (my $clause = <$clauses_in>) {
    $clause =~ s/^ +//;

    # An empty line directly after a newline would only extend the run.
    next if $clause eq "\n" && $last_was_newline;

    print $final_out $clause;
    $last_was_newline = ($clause =~ /\n\z/) ? 1 : 0;
}
close($clauses_in);
close($final_out)
    or die "prepare-clauses.pl: error writing $outfile: $!\n";

# Remove the intermediate files; a leftover temp file is not fatal.
foreach my $temp_file ("$infile.1", "$infile.2") {
    unlink($temp_file)
        or warn "prepare-clauses.pl: cannot remove $temp_file: $!\n";
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: