Changeset 1808
- Timestamp:
- 2000-12-19T12:03:38+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/phind.pm
r1803 r1808 31 31 # options are: 32 32 # button=Name The label for the classifiers button in the 33 # navigation bar (defaults to " Topic").33 # navigation bar (defaults to "Phrase"). 34 34 # title=Title The metadata field used to describe each document 35 35 # (defaults to "Title"). 36 36 # text=fields The text used to build the phrase hierarchy 37 37 # (defaults to "section:Title,section:text"). 38 # phinddir=directory Location of phind index files38 # phinddir=directory Location of phind index files 39 39 # verbosity=num Control amount of output 40 40 # untidy=true Do not clean up intermediate files 41 41 # suffixmode=num Mode of suffix program (0 = all phrases, 1 = stopword) 42 42 # suffixsize=num Number of symbols available to suffix program 43 43 # savephrases=filename If set, phrase information will be stored in filename 44 # as text. (By default, it is not set.) 44 45 45 46 # How a classifier works. … … 155 156 my $suffixmode = 1; 156 157 my $suffixsize = 40000000; 158 my $savephrases = ""; 157 159 158 160 my $verbosity = 2; … … 174 176 } elsif ($option =~ /^phinddir=(.*)$/i) { 175 177 $phinddir = $1; 178 } elsif ($option =~ /^savephrases=(.*)$/i) { 179 $savephrases = $1; 176 180 } elsif ($option =~ /^suffixsize=(.*)$/i) { 177 181 $suffixsize = $1; 178 182 } elsif ($option =~ /^suffixmode=(.*)$/i) { 179 183 $suffixmode = $1; 184 } elsif ($option =~ /^untidy/i) { 185 $untidy = 1; 180 186 } elsif ($option =~ /^verbosity=(.*)$/i) { 181 187 $verbosity = $1; 182 } elsif ($option =~ /^untidy/i) {183 $untidy = 1;184 188 } 185 189 } … … 194 198 $self->{'suffixmode'} = $suffixmode; 195 199 $self->{'suffixsize'} = $suffixsize; 200 $self->{'savephrases'} = $savephrases if ($savephrases); 196 201 197 202 # limit languages … … 380 385 # Create the phrase file and put phrase numbers in phind/phrases 381 386 print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity; 382 &renumber_phrases( "$phinddir", $verbosity);387 &renumber_phrases($self); 383 388 384 
389 # Create the mg phrase database … … 427 432 428 433 # Tidy up stray files 429 if (!$ untidy) {434 if (!$self->{'untidy'}) { 430 435 print "\nCleaning up\n" if ($verbosity > 2); 431 436 &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers", … … 795 800 796 801 sub renumber_phrases { 797 my ($phind_dir, $verbosity) = @_; 802 my $self = shift (@_); 803 804 my $verbosity = $self->{'verbosity'}; 805 my $phind_dir = $self->{'phinddir'}; 806 807 my $savephrases = 0; 808 $savephrases = $self->{'savephrases'} if (defined($self->{'savephrases'})); 809 810 798 811 799 812 # Sort the phrases into order of increasing frequency … … 802 815 system("sort -rnt ':' +2 -o $phind_dir/phrases $phind_dir/phrases"); 803 816 817 # Read the vocabulary 804 818 my @symbol; 805 806 # Read the vocabulary807 819 print "Reading the vocabulary\n" if ($verbosity); 808 820 open(V, "<$phind_dir/clauses.vocab") … … 846 858 } 847 859 848 860 849 861 # Now we create a new phrase file using phrase numbers, not the old IDs. 850 862 print "Format phrase data for MGPP\n" if ($verbosity); 851 863 864 # Open the basic files 852 865 open(IN, "<$phind_dir/phrases"); 853 866 open(DATA, ">$phind_dir/pdata.txt"); 854 867 open(IDX, ">$phind_dir/pword.txt"); 855 868 869 # We may want to save the phrases in a separate text file 870 if ($savephrases) { 871 print "Saving phrases in $savephrases\n" if ($verbosity); 872 open(SAVE, ">$savephrases"); 873 } 874 856 875 my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word); 857 876 my @fields; … … 921 940 print IDX "<Document>$word\n"; 922 941 923 924 } 942 # output the phrases to a text file 943 if ($savephrases) { 944 print SAVE "$tf\t$countdocs\t$text\n"; 945 } 946 947 } 948 949 close SAVE if ($savephrases); 925 950 } 926 951
Note:
See TracChangeset
for help on using the changeset viewer.