#!/usr/bin/perl -w

# prepare-clauses.pl
# Version 1.1

# Kea -- Automatic Keyphrase Extraction
# Copyright 1998-1999 by Gordon Paynter and Eibe Frank
# Contact [email protected] or [email protected]
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# Version history
#
# 1.0 Witten et.al.
# 1.0.1 Bug: Sentences ending in ".)"
# 1.0.2 Bug: Sentences ending in ".'" and so on have the same problem.
# 1.0.3 All "." characters removed from the start of tokens.
# 1.1 First Distribution. GPL added.

# Command line: prepare-clauses.pl <input-text> <output-clauses>
#
# Both arguments are required and must be non-empty.  The check counts the
# arguments and tests their length rather than their truthiness, so that a
# file literally named "0" is accepted.
if (@ARGV < 2 || !length($ARGV[0]) || !length($ARGV[1])) {
    die "usage: prepare-clauses.pl <input-text> <output-clauses>\n";
}

# $infile  - text to be split into clauses; also the stem of the temporary
#            files "$infile.1" and "$infile.2" used by the later steps.
# $outfile - where the final one-clause-per-line text is written.
my ($infile, $outfile) = @ARGV[0, 1];

# Progress banner on standard output.
print "prepare-clauses.pl\n";
print "Input text: $infile\n";
print "Output clauses: $outfile\n";

# step 1: put every sentence on a new line.
#
# Reads $infile line by line and writes the intermediate file "$infile.1".
# Each input line has its trailing newline folded into a space, so text keeps
# accumulating on the current output line until a sentence terminator
# (. ! ?) followed by a non-alphanumeric character introduces a line break.
{
    # Three-argument open keeps characters in the file name from being
    # interpreted as an open mode or a pipe; failures are fatal instead of
    # silently producing an empty result.
    open(my $in, '<', $infile)
        or die "prepare-clauses.pl: cannot read '$infile': $!\n";
    open(my $out, '>', "$infile.1")
        or die "prepare-clauses.pl: cannot write '$infile.1': $!\n";

    while (my $line = <$in>) {

        # remove hyphens at end of line (re-joins words split across lines)
        $line =~ s/\-\s+$//;

        # replace \n and other whitespace with space
        $line =~ s/\s+/ /g;

        # one sentence per line: normalise the terminator and whatever
        # non-alphanumeric characters follow it to ". ", then turn that
        # into a space plus newline
        $line =~ s/[\.\!\?][^A-Za-z0-9]+/. /g;
        $line =~ s/[\.\!\?] +/\ \n/g;

        # space out the punctuation
        # double-hyphens (equivalent to an M-dash) replaced with a clause breaker
        $line =~ s/\-\-/ : /g;
        # delete apostrophe marks
        $line =~ s/\'//g;
        # remove all "." characters from the start of a token
        $line =~ s/ \./ /g;

        # space out any other character that is not something we are interested in
        $line =~ s/([^\w\.\n])/ $1 /g;
        # allow for underscore, which perl considers a "word" character
        $line =~ s/_/ _ /g;

        # squeeze spaces
        $line =~ s/\ +/ /g;
        $line =~ s/^\ //g;

        # put a fullstop at the end of each line/sentence
        $line =~ s/\n/.\n/g;

        # print; "." does not match a newline, so the else branch is taken
        # only when nothing but newlines (or nothing at all, as for a blank
        # input line) is left, in which case a lone full stop line is written
        if ($line =~ /./) {
            print {$out} $line;
        } else {
            print {$out} ".\n";
        }

    }

    close($in);
    # Buffered write errors only surface when the handle is closed.
    close($out)
        or die "prepare-clauses.pl: error writing '$infile.1': $!\n";
}

# step 2: the tagger (disabled; the command is kept for reference only)
# `tag $infile.lines > $infile.tagged`;

# step 3: put a clause on each line and get rid of punctuation
#
# Reads the sentence-per-line file "$infile.1" and writes "$infile.2".
# Every whitespace-separated token is classified: clause breakers end the
# current output line, tokens containing a letter are kept (each preceded
# by a single space), and everything else is dropped.
{
    open(my $in, '<', "$infile.1")
        or die "prepare-clauses.pl: cannot read '$infile.1': $!\n";
    open(my $out, '>', "$infile.2")
        or die "prepare-clauses.pl: cannot write '$infile.2': $!\n";

    while (my $line = <$in>) {

        foreach my $word (split(/\s+/, $line)) {
            if ($word =~ /^[\:\;\,\[\]\{\}\(\)]$/) {
                # print new line for clause breakers (. and ? and ! done above)
                print {$out} "\n";
            } elsif (($word =~ /^\d+$/) || ($word =~ /^\d+\.\d+$/)) {
                # "pure" numbers are clause breakers
                print {$out} "\n";
            } elsif ($word =~ /[A-Za-z]/) {
                # print anything that contains at least one letter
                print {$out} " $word";
            } else {
                # simply ignore whatever is left over (this includes the
                # empty leading field split produces for a line that starts
                # with whitespace)
            }
        }

        # every input sentence ends the current clause
        print {$out} "\n";
    }

    close($in);
    # Buffered write errors only surface when the handle is closed.
    close($out)
        or die "prepare-clauses.pl: error writing '$infile.2': $!\n";
}

# step 4: pretty things up a tad
#
# Copies "$infile.2" to $outfile with leading spaces stripped from every line
# and runs of consecutive newlines squeezed to a single newline, then removes
# the two temporary files.  This is done in Perl rather than through a shell
# pipeline (cat | perl | tr -s '\n') so that file names containing spaces or
# shell metacharacters are handled safely and failures are reported.
{
    open(my $in, '<', "$infile.2")
        or die "prepare-clauses.pl: cannot read '$infile.2': $!\n";
    open(my $out, '>', $outfile)
        or die "prepare-clauses.pl: cannot write '$outfile': $!\n";

    # True when the last character written was a newline; used to squeeze
    # repeated newlines the way `tr -s '\n'` does (a single newline at the
    # very start of the stream is therefore still written).
    my $prev_was_newline = 0;

    while (my $line = <$in>) {
        # strip leading spaces
        $line =~ s/^ +//;

        # an empty line directly after a newline would only repeat it
        next if $line eq "\n" && $prev_was_newline;

        print {$out} $line;
        $prev_was_newline = ($line =~ /\n\z/) ? 1 : 0;
    }

    close($in);
    # Buffered write errors only surface when the handle is closed.
    close($out)
        or die "prepare-clauses.pl: error writing '$outfile': $!\n";
}

# remove the intermediate files; a failure here is reported but not fatal
# because the output has already been written
foreach my $tmp ("$infile.1", "$infile.2") {
    unlink($tmp)
        or warn "prepare-clauses.pl: cannot remove '$tmp': $!\n";
}