source: trunk/gsdl/perllib/Kea-1.1.4/prepare-clauses.pl@ 1972

Last change on this file since 1972 was 1972, checked in by jmt14, 23 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.2 KB
Line 
1#!/usr/bin/perl -w
2
3# prepare-clauses.pl
4# Version 1.1
5
6# Kea -- Automatic Keyphrase Extraction
7# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
8# Contact [email protected] or [email protected]
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23
24# Version history
25#
26# 1.0 Witten et.al.
27# 1.0.1 Bug: Sentences ending in ".)"
28# 1.0.2 Bug: Sentences ending in ".'" and so on have the same problem.
29# 1.0.3 All "." characters removed from the start of tokens.
30# 1.1 First Distribution. GPL added.
31
# Command-line handling.
#
# Two arguments are required: the plain-text input file and the file that
# will receive the final clause-per-line output.  The arguments are tested
# for being present and non-empty rather than for truthiness, so that a
# file literally named "0" is accepted.
if (@ARGV < 2 || !length($ARGV[0]) || !length($ARGV[1])) {
    die "usage: prepare-clauses.pl <input-text> <output-clauses>\n";
}

# Any arguments beyond the first two are ignored.
my $infile  = $ARGV[0];
my $outfile = $ARGV[1];

# Progress banner on standard output.
print "prepare-clauses.pl\n";
print "Input text: $infile\n";
print "Output clauses: $outfile\n";
42
# Step 1 reads the input text and writes the intermediate file "$infile.1".
# Three-argument open with lexical handles is used so that odd characters in
# the file name cannot be mistaken for an open mode, and every failure is
# reported instead of silently producing empty output.
my $sentence_file = "$infile.1";
open(my $text_in, '<', $infile)
    or die "prepare-clauses.pl: cannot read $infile: $!\n";
open(my $sentence_out, '>', $sentence_file)
    or die "prepare-clauses.pl: cannot write $sentence_file: $!\n";


# step 1: put every sentence on a new line.
#
# Each input line is rewritten in place in $_.  Because all whitespace
# (including the trailing newline) is collapsed to a single space, a line
# that does not end a sentence is written without a newline and therefore
# runs on into the text of the following input line.

while (<$text_in>) {

    # remove hyphens at end of line
    s/\-\s+$//;

    # replace \n and other whitespace with space
    s/\s+/ /g;
    # one sentence per line: first normalise any sentence terminator plus
    # the non-alphanumerics that follow it (closing quotes, brackets, ...)
    # to ". ", then turn each terminator into a line break
    s/[\.\!\?][^A-Za-z0-9]+/. /g;
    s/[\.\!\?] +/\ \n/g;

    # space out the punctuation
    # double-hyphens (equivalent to an M-dash) replaced with a clause breaker
    s/\-\-/ : /g;
    # delete apostrophe marks
    s/\'//g;
    # remove all "." characters from the start of a token
    s/ \./ /g;

    # space out any other character that is not something we are interested in
    s/([^\w\.\n])/ $1 /g;
    # allow for underscore, which perl considers a "word" character
    s/_/ _ /g;

    # squeeze spaces, then drop a single leading space from the buffer
    s/\ +/ /g;
    s/^\ //g;

    # put a fullstop at the end of each line/sentence
    s/\n/.\n/g;

    # print; when nothing but newlines (or nothing at all) is left, as for a
    # blank input line, emit a lone full stop so the break is preserved
    if ($_ =~ /./) {
        print $sentence_out "$_";
    } else {
        print $sentence_out ".\n";
    }

}
close($text_in);
# Buffered write errors only surface when the handle is closed.
close($sentence_out)
    or die "prepare-clauses.pl: error writing $sentence_file: $!\n";
90
91# step 2: the tagger
92# `tag $infile.lines > $infile.tagged`;
93
94# step 3: put a clause on each line and get rid of punctuation
95
# Step 3 reads the sentence-per-line file "$infile.1" produced by step 1 and
# writes "$infile.2" with one clause per line.  Both opens are checked so a
# missing or unwritable intermediate file is reported rather than ignored.
my $sentence_path = "$infile.1";
my $clause_path   = "$infile.2";
open(my $sentence_in, '<', $sentence_path)
    or die "prepare-clauses.pl: cannot read $sentence_path: $!\n";
open(my $clause_out, '>', $clause_path)
    or die "prepare-clauses.pl: cannot write $clause_path: $!\n";

while (<$sentence_in>) {

    # Tokens are whitespace separated; a leading empty field produced by
    # split falls through every test below and is discarded.
    my @words = split(/\s+/, $_);
    foreach my $w (@words) {
        if ($w =~ /^[\:\;\,\[\]\{\}\(\)]$/) {
            # print new line for clause breakers (. and ? and ! done above)
            print $clause_out "\n";
        } elsif (($w =~ /^\d+$/) || ($w =~ /^\d+\.\d+$/)) {
            # "pure" numbers are clause breakers
            print $clause_out "\n";
        } elsif ($w =~ /[A-Za-z]/) {
            # print anything that contains at least one letter
            print $clause_out " $w";
        }
        # anything left over (other punctuation, empty fields) is ignored
    }
    # every input line (sentence) also ends the current clause
    print $clause_out "\n";
}
close($sentence_in);
# Buffered write errors only surface when the handle is closed.
close($clause_out)
    or die "prepare-clauses.pl: error writing $clause_path: $!\n";
120
121# step 4: pretty things up a tad
122
# Tidy the clause file "$infile.2" into the final output file.  This is done
# in Perl rather than through a shell pipeline so that file names containing
# spaces or shell metacharacters are handled safely and no external
# cat/tr/rm programs are required.
my $clause_src = "$infile.2";
open(my $clause_in, '<', $clause_src)
    or die "prepare-clauses.pl: cannot read $clause_src: $!\n";
# Slurp the whole file; an empty file yields undef, treated as "".
my $clause_text = do { local $/; <$clause_in> };
close($clause_in);
$clause_text = '' unless defined $clause_text;

# Strip the leading spaces from every line ...
$clause_text =~ s/^ +//mg;
# ... and squeeze each run of newlines down to one, removing empty lines.
$clause_text =~ tr/\n//s;

open(my $final_out, '>', $outfile)
    or die "prepare-clauses.pl: cannot write $outfile: $!\n";
print $final_out $clause_text;
close($final_out)
    or die "prepare-clauses.pl: error writing $outfile: $!\n";

# Remove the two intermediate files; unlink returns how many it deleted.
unlink("$infile.1", "$infile.2") == 2
    or warn "prepare-clauses.pl: could not remove all temporary files: $!\n";
Note: See TracBrowser for help on using the repository browser.