[1972] | 1 | #!/usr/bin/perl -w
|
---|
| 2 |
|
---|
| 3 | # prepare-clauses.pl
|
---|
| 4 | # Version 1.1
|
---|
| 5 |
|
---|
| 6 | # Kea -- Automatic Keyphrase Extraction
|
---|
| 7 | # Copyright 1998-1999 by Gordon Paynter and Eibe Frank
|
---|
| 8 | # Contact [email protected] or [email protected]
|
---|
| 9 | #
|
---|
| 10 | # This program is free software; you can redistribute it and/or modify
|
---|
| 11 | # it under the terms of the GNU General Public License as published by
|
---|
| 12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | # (at your option) any later version.
|
---|
| 14 | #
|
---|
| 15 | # This program is distributed in the hope that it will be useful,
|
---|
| 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | # GNU General Public License for more details.
|
---|
| 19 | #
|
---|
| 20 | # You should have received a copy of the GNU General Public License
|
---|
| 21 | # along with this program; if not, write to the Free Software
|
---|
| 22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 |
|
---|
| 24 | # Version history
|
---|
| 25 | #
|
---|
| 26 | # 1.0 Witten et al.
|
---|
| 27 | # 1.0.1 Bug: Sentences ending in ".)"
|
---|
| 28 | # 1.0.2 Bug: Sentences ending in ".'" and so on have the same problem.
|
---|
| 29 | # 1.0.3 All "." characters removed from the start of tokens.
|
---|
| 30 | # 1.1 First Distribution. GPL added.
|
---|
| 31 |
|
---|
# Both positional arguments are required: <input-text> and <output-clauses>.
# Count @ARGV and test for empty strings instead of testing truthiness, so
# that a file literally named "0" is still accepted.
if (@ARGV < 2 || !length($ARGV[0]) || !length($ARGV[1])) {
    die "usage: prepare-clauses.pl <input-text> <output-clauses>\n";
}
|
---|
| 35 |
|
---|
# Capture the two file names given on the command line; both are used by
# every later step of the script.
($infile, $outfile) = @ARGV[0, 1];

# Announce the script name and the files it is about to work on.
print "prepare-clauses.pl\n",
      "Input text: $infile\n",
      "Output clauses: $outfile\n";
|
---|
| 42 |
|
---|
# step 1: put every sentence on a new line.
#
# Reads $infile line by line and writes the result to "$infile.1".  A newline
# is only written where a sentence terminator was found (or for a blank input
# line), so a sentence that spans several input lines ends up on one output
# line.
{
    # Three-argument open with lexical handles: the file name is never parsed
    # for mode characters, and a missing or unwritable file is reported
    # instead of silently producing empty output.
    open(my $text_in, '<', $infile)
        or die "prepare-clauses.pl: cannot read $infile: $!\n";
    open(my $sentences_out, '>', "$infile.1")
        or die "prepare-clauses.pl: cannot write $infile.1: $!\n";

    while (my $line = <$text_in>) {

        # remove hyphens at end of line (the trailing whitespace, including
        # the newline, goes too, so the word continues on the next input line)
        $line =~ s/\-\s+$//;

        # replace \n and other whitespace with space
        $line =~ s/\s+/ /g;

        # one sentence per line: first normalise each terminator plus any
        # non-alphanumerics following it to ". ", then turn that into a break
        $line =~ s/[\.\!\?][^A-Za-z0-9]+/. /g;
        $line =~ s/[\.\!\?] +/\ \n/g;

        # space out the punctuation
        # double-hyphens (equivalent to an M-dash) replaced with a clause breaker
        $line =~ s/\-\-/ : /g;
        # delete apostrophe marks
        $line =~ s/\'//g;
        # remove a "." character from the start of a token
        $line =~ s/ \./ /g;

        # space out any other character that is not something we are interested in
        $line =~ s/([^\w\.\n])/ $1 /g;
        # allow for underscore, which perl considers a "word" character
        $line =~ s/_/ _ /g;

        # squeeze spaces
        $line =~ s/\ +/ /g;
        $line =~ s/^\ //g;

        # put a fullstop at the end of each line/sentence
        $line =~ s/\n/.\n/g;

        # print; when nothing but (possibly) newlines is left, e.g. for a
        # blank input line, write a lone full stop instead
        if ($line =~ /./) {
            print {$sentences_out} $line;
        } else {
            print {$sentences_out} ".\n";
        }
    }

    close($text_in);
    # Buffered write errors only surface at close, so check it.
    close($sentences_out)
        or die "prepare-clauses.pl: error writing $infile.1: $!\n";
}
|
---|
| 90 |
|
---|
| 91 | # step 2: the tagger
|
---|
| 92 | # `tag $infile.lines > $infile.tagged`;
|
---|
| 93 |
|
---|
| 94 | # step 3: put a clause on each line and get rid of punctuation
|
---|
| 95 |
|
---|
# step 3: put a clause on each line and get rid of punctuation
#
# Reads the sentence-per-line file "$infile.1" and writes "$infile.2".  Each
# kept word is written with a leading space; each clause breaker and the end
# of each input line is written as a newline.
{
    # Three-argument open with lexical handles, checked so that a missing
    # intermediate file is reported rather than silently ignored.
    open(my $sentences_in, '<', "$infile.1")
        or die "prepare-clauses.pl: cannot read $infile.1: $!\n";
    open(my $clauses_out, '>', "$infile.2")
        or die "prepare-clauses.pl: cannot write $infile.2: $!\n";

    while (my $sentence = <$sentences_in>) {

        foreach my $w (split(/\s+/, $sentence)) {
            if (   $w =~ /^[\:\;\,\[\]\{\}\(\)]$/
                || $w =~ /^\d+$/
                || $w =~ /^\d+\.\d+$/)
            {
                # print new line for clause breakers (. and ? and ! done
                # above); "pure" numbers are clause breakers as well
                print {$clauses_out} "\n";
            } elsif ($w =~ /[A-Za-z]/) {
                # print anything that contains at least one letter
                print {$clauses_out} " $w";
            }
            # anything left over (other punctuation, empty fields) is ignored
        }

        # end of the sentence always ends the clause
        print {$clauses_out} "\n";
    }

    close($sentences_in);
    # Buffered write errors only surface at close, so check it.
    close($clauses_out)
        or die "prepare-clauses.pl: error writing $infile.2: $!\n";
}
|
---|
| 120 |
|
---|
| 121 | # step 4: pretty things up a tad
|
---|
| 122 |
|
---|
# Copy "$infile.2" to $outfile, stripping leading spaces from every line and
# collapsing each run of newlines into one, then delete the two intermediate
# files.  Done in Perl rather than through a shell pipeline so that file names
# containing spaces or shell metacharacters are handled safely and failures
# are reported.
{
    open(my $clauses_in, '<', "$infile.2")
        or die "prepare-clauses.pl: cannot read $infile.2: $!\n";
    open(my $final_out, '>', $outfile)
        or die "prepare-clauses.pl: cannot write $outfile: $!\n";

    # True when the last character written was a newline.  It starts false,
    # so a blank first line still yields one newline, exactly as squeezing
    # repeated newlines in the whole stream would.
    my $after_newline = 0;

    while (my $clause = <$clauses_in>) {
        $clause =~ s/^ +//;

        if ($clause eq "\n") {
            # blank line: keep it only if it does not extend a newline run
            next if $after_newline;
            print {$final_out} "\n";
            $after_newline = 1;
        } else {
            print {$final_out} $clause;
            $after_newline = ($clause =~ /\n\z/) ? 1 : 0;
        }
    }

    close($clauses_in);
    # Buffered write errors only surface at close, so check it.
    close($final_out)
        or die "prepare-clauses.pl: error writing $outfile: $!\n";
}

# Remove the intermediate files; unlink returns the number actually deleted.
if (unlink("$infile.1", "$infile.2") != 2) {
    warn "prepare-clauses.pl: could not remove all temporary files: $!\n";
}
|
---|