###########################################################################
#
# phind.pm -- the Phind classifier
#
# Copyright (C) 2000 Gordon W. Paynter
# Copyright (C) 2000 New Zealand Digital Library Project
#
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# The phind classifier plugin
#
# options are:
#   title=Title           The title field for this classification
#   text=fields           The text used to build the phrase hierarchy
#   phindexdir=directory  Location of phind index files
#   verbosity=num         Control amount of output
#   untidy=true           Do not clean up intermediate files
#   suffixmode=num        Mode of suffix program (0 = all phrases, 1 = stopword)
#   suffixsize=num        Number of symbols available to suffix program

# How a classifier works.
#
# When a classifier is requested in the collect.cfg file, buildcol creates a
# new classifier object (such as the one defined in this file) and later
# passes each document object to the classifier in turn.  Four functions are
# used:
#
# 1. "new" is called before the documents are processed to set up the
#    classifier.
#
# 2.
# "init" is called after buildcol.pl has created the indexes etc but
#    before the documents are classified in order that the classifier might
#    set any variables it requires, etc.
#
# 3. "classify" is called once for each document object.  The classifier
#    "classifies" each document and updates its local data accordingly.
#
# 4. "get_classify_info" is called after every document has been
#    classified.  It collates the information about the documents and
#    stores a reference to the classifier so that Greenstone can later
#    display it.

package phind;

use BasClas;
use util;

sub BEGIN {
    @ISA = ('BasClas');
}

# Define delimiter symbols used in the clauses file - this should be
# abstracted out someplace
my $colstart = "COLLECTIONSTART";
my $colend   = "COLLECTIONEND";
my $doclimit = "DOCUMENTLIMIT";
my $senlimit = "SENTENCELIMIT";
my @delimiters = ($colstart, $colend, $doclimit, $senlimit);


# Create a new phind browser based on the options in collect.cfg.
#
# Arguments: the class name, the classifier options from collect.cfg,
# with the output filehandle as the last element of @_.
# Returns: a blessed phind object.  Exits if the phind support programs
# are not installed or we are running on Windows.
sub new {
    my ($class, @options) = @_;
    my $self = new BasClas($class, @_);
    my $out = pop @options;

    # Phind installation check
    #
    # The phind phrase browser is research software and is not installed
    # by default.  If the user attempts to use it we warn them that it's
    # a bit dodgy, then tell them how to install it.  If they can do that
    # and get all the files in place, then we let them proceed.

    print $out "The Phind classifier for Greenstone.\n";
    print $out "Checking the phind phrase browser requirements...\n";

    # Make sure we're not in windows
    if ($ENV{'GSDLOS'} =~ /windows/i) {
        print STDERR "Phind currently only works under Unix";
        exit(1);
    }

    # Ensure the Phind generate scripts are in place
    my $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "suffix");
    my $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "generate");
    if (!(-e $file1)) {
        print STDERR "The phind \"suffix\" program is not installed. ";
        print STDERR "To install it, change to the directory\n";
        print STDERR " $src\n";
        print STDERR "and type \"make install-phind\".\n\n";
        exit(1);
    }

    # Ensure the Phind CGI script is in place
    $file1 = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "phindcgi");
    $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "host");
    if (!(-e $file1)) {
        print STDERR "The phind CGI program is not installed. ";
        print STDERR "To install it, change to the directory\n";
        print STDERR " $src\n";
        print STDERR "and type \"make install-phind\".\n\n";
        exit(1);
    }

    # Ensure the Phind Java applet is in place
    $src = &util::filename_cat($ENV{'GSDLHOME'}, "src", "phind", "client");
    $file1 = &util::filename_cat($src, "Phind.class");
    if (!(-e $file1)) {
        print STDERR "The phind Java classes are not compiled. ";
        print STDERR "To compile them, change to the directory\n";
        print STDERR " $src\n";
        print STDERR "and use your Java compiler to compile Phind.java.\n";
        print STDERR "(if you have Java 1.2 installed, type \"javac Phind.java\")\n\n";
        exit(1);
    }

    # The installation appears OK - set up the classifier defaults
    my $collection = $ENV{'GSDLCOLLECTION'};
    my $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "phindex");
    my $language = "english";
    my $title = "Topic";
    my $indexes = "section:Title,section:text";
    my $suffixmode = 1;
    my $suffixsize = 40000000;
    my $verbosity = 2;
    my $untidy = 0;

    # parse the options
    foreach my $option (@options) {
        if ($option =~ /^text=(.*)$/i) {
            $indexes = $1;
        } elsif ($option =~ /^title=(.*)$/i) {
            $title = $1;
        } elsif ($option =~ /^phindexdir=(.*)$/i) {
            $phindexdir = $1;
        } elsif ($option =~ /^suffixsize=(.*)$/i) {
            $suffixsize = $1;
        } elsif ($option =~ /^suffixmode=(.*)$/i) {
            $suffixmode = $1;
        } elsif ($option =~ /^verbosity=(.*)$/i) {
            $verbosity = $1;
        } elsif ($option =~ /^untidy/i) {
            $untidy = 1;
        }
    }

    $self->{'collection'} = $collection;
    $self->{'title'} = $title;
    $self->{'indexes'} = $indexes;
    $self->{'suffixmode'} = $suffixmode;
    $self->{'suffixsize'} = $suffixsize;
    $self->{'verbosity'} = $verbosity;
    $self->{'untidy'} = $untidy;

    # limit languages: turn the comma-separated list into an alternation
    # pattern that classify() matches against each document's Language
    $language =~ s/,/\|/g;
    $self->{'language_exp'} = $language;
    # BUGFIX(review): the original assigned $self->{'delimiter'} = $delimiter,
    # but $delimiter was never declared (the delimiters live in @delimiters),
    # so the field was always undef; nothing in this file reads it, so the
    # assignment has been removed.

    # reset phindex directory
    if (-e "$phindexdir") {
        &util::rm_r("$phindexdir");
    }
    &util::mk_dir("$phindexdir");
    $self->{'phindexdir'} = $phindexdir;

    return bless $self, $class;
}


# Initialise the phind classifier: open the filehandles that classify()
# writes to - 'txthandle' accumulates clause text, 'dochandle' the
# per-document details.
sub init {
    my $self = shift (@_);

    # open filehandles for documents and text
    my $phindexdir = $self->{'phindexdir'};

    my $clausefile = &util::filename_cat("$phindexdir", "clauses");
    &util::rm($clausefile) if (-e $clausefile);
    # 3-arg open with a lexical handle; classify() prints through the
    # stored scalar, so this is interchangeable with the old bareword
    open(my $txthandle, '>', $clausefile) || die "Cannot open $clausefile: $!";
    $self->{'txthandle'} = $txthandle;

    my $docfile = &util::filename_cat("$phindexdir", "docs.txt");
    &util::rm($docfile) if (-e $docfile);
    open(my $dochandle, '>', $docfile) || die "Cannot open $docfile: $!";
    $self->{'dochandle'} = $dochandle;
}
# Classify each document.
#
# Each document object is passed here in turn.  The classifier extracts
# the text of each and stores it in the clauses file; document details
# are stored in the docs.txt file.
sub classify {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $verbosity = $self->{'verbosity'};
    my $top_section = $doc_obj->get_top_section();

    my $title = $doc_obj->get_metadata_element($top_section, "Title");
    print "process: $title\n" if ($verbosity > 2);

    # only consider documents in the classifier's language(s)
    my $doclanguage = $doc_obj->get_metadata_element($top_section, "Language");
    my $phrlanguage = $self->{'language_exp'};
    return if ($doclanguage && ($doclanguage !~ /$phrlanguage/i));

    # record this file
    # BUGFIX(review): the original read "my $total++" (a compile error) and
    # tested $self->{'$verbosity'} - a literal-'$' key that is never set -
    # so the count was broken and the message never printed.  $file was
    # also undefined; report the document title instead.
    $self->{'num_documents'}++;
    print "file $self->{'num_documents'}: $title\n" if ($verbosity);

    # Store document details
    my $OID = $doc_obj->get_OID();
    $OID = "NULL" unless defined $OID;
    my $dochandle = $self->{'dochandle'};
    # NOTE(review): the field before the first tab appears to have been
    # lost from this copy of the file -- confirm the docs.txt record
    # format against the phindcgi reader.
    print $dochandle "\t$OID\t$title\n";

    # Store the text occurring in this object

    # output the document delimiter
    my $txthandle = $self->{'txthandle'};
    print $txthandle "$doclimit\n";

    # iterate over the required indexes and store their text
    my $indexes = $self->{'indexes'};
    my $text = "";
    my ($part, $level, $field, $section, $data);

    foreach $part (split(/,/, $indexes)) {

        # Each field has a level and a data element (e.g. document:Title)
        ($level, $field) = split(/:/, $part);
        die unless ($level && $field);

        # Extract the text from every section
        # (In phind, document:text and section:text are equivalent)
        if ($field eq "text") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_text($section) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($data) . "\n";
        }

        # Extract a metadata field from a document
        elsif ($level eq "document") {
            $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
            $text .= convert_gml_to_tokens($data) . "\n";
        }

        # Extract metadata from every section in a document
        elsif ($level eq "section") {
            $data = "";
            $section = $doc_obj->get_top_section();
            while (defined($section)) {
                $data .= $doc_obj->get_metadata_element($section, $field) . "\n";
                $section = $doc_obj->get_next_section($section);
            }
            $text .= convert_gml_to_tokens($data) . "\n";
        }

        # Some sort of specification which I don't understand
        else {
            die "Unknown level ($level) in phind index ($part)\n";
        }
    }

    # output the text, squashing repeated newlines
    $text =~ tr/\n//s;
    print $txthandle "$text";
}


# Construct the classifier from the information already gathered.
#
# When get_classify_info is called, the clauses and docs.txt files have
# already been constructed in the phindex directory.  This function
# translates them into compressed, indexed MGPP files that can be read
# by the phindcgi script.  It also registers our classifier so that it
# shows up in the navigation bar.
sub get_classify_info {
    my $self = shift (@_);

    my $verbosity = $self->{'verbosity'};
    my $phindexdir = $self->{'phindexdir'};
    # BUGFIX(review): $untidy was used in the cleanup step below but never
    # fetched from $self, so it was always undef and the "untidy" option
    # was silently ignored (intermediate files were always removed).
    my $untidy = $self->{'untidy'};
    my $language = "english";

    if ($verbosity) {
        print STDERR "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n";
    }

    # Construct phind indexes
    my $suffixmode = $self->{'suffixmode'};
    my $suffixsize = $self->{'suffixsize'};
    my ($command, $status);

    # Generate the vocabulary, symbol statistics, and numbers file
    # from the clauses file
    print "\nExtracting vocabulary and statistics\n" if $verbosity;
    &extract_vocabulary($phindexdir, $language, $verbosity);

    # Use the suffix program to generate the phindex/phrases file
    print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity;
    &execute("suffix $phindexdir $suffixsize $suffixmode", $verbosity);

    # Create the phrase file and put phrase numbers in phindex/phrases
    print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity;
    &renumber_phrases("$phindexdir", $verbosity);

    # Create the mg phrase database
    my $mgpp = &util::filename_cat($ENV{'GSDLHOME'}, "src", "mgpp");
    my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes");
    my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict");
    my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build");
    my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build");
    my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict");
    my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx");

    print "\nCreating phrase databases\n";
    my $mg_input = &util::filename_cat($phindexdir, "pdata.txt");
    my $mg_stem = "pdata";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

    # create the mg index of words
    print "\nCreating word-level search indexes\n";
    $mg_input = &util::filename_cat($phindexdir, "pword.txt");
    $mg_stem = "pword";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);
    &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity);

    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity);
    &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity);

    # create the mg document information database
    print "\nCreating document information databases\n";
    $mg_input = &util::filename_cat($phindexdir, "docs.txt");
    $mg_stem = "docs";

    &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity);
    &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity);
    &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity);

    # Tidy up stray files unless the user asked to keep them
    if (!$untidy) {
        print "\nCleaning up\n" if ($verbosity > 2);
        &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers",
                  "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats",
                  "$phindexdir/phrases", "$phindexdir/docs.txt",
                  "$phindexdir/pdata.txt", "$phindexdir/pword.txt");
        my $outfile = 1;
        while (-e "$phindexdir/outPhrase.$outfile") {
            &util::rm("$phindexdir/outPhrase.$outfile");
            $outfile++;
        }
    }

    # Insert the classifier into.... what?
    my $collection = $self->{'collection'};
    my $url = "library?a=p&p=phind&c=$collection";
    my %classifyinfo = ('thistype'  => 'Invisible',
                        'childtype' => 'Phind',
                        'Title'     => $self->{'title'},
                        'contains'  => []);
    push (@{$classifyinfo{'contains'}}, {'OID' => $url});
    return \%classifyinfo;
}


# Convert GML (or HTML-ish) document text into phind "clause format":
# strip markup, normalise punctuation to clause breaks, and return the
# text with one clause per line.
sub convert_gml_to_tokens {
    $_ = shift @_;

    # FIRST, remove GML tags

    # Replace all whitespace with a simple space
    s/\s+/ /gs;

    # Map paragraph and line-break tags to placeholders, then remove
    # everything else that is in a tag.
    # NOTE(review): the literal tag and entity patterns in this sub were
    # corrupted in this copy of the file and have been reconstructed --
    # confirm against the original phind.pm.
    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    s/\s*<br>\s*/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # Now we have the text, but it may contain HTML
    # elements coded as &gt; etc.  Decode these, then remove the tags
    # they reveal.
    s/&lt;/</sg;
    s/&gt;/>/sg;
    s/\s+/ /sg;
    s/\s*<p>\s*/ PARAGRAPHBREAK /isg;
    s/\s*<br>\s*/ LINEBREAK /isg;
    s/<[^>]*>/ /sg;

    # remove &amp; and other miscellaneous markup tags
    s/&amp;/&/sg;
    s/&lt;/</sg;
    s/&gt;/>/sg;
    s/&amp;/&/sg;

    # replace <p> and <br> placeholders with carriage returns
    s/PARAGRAPHBREAK/\n/sg;
    s/LINEBREAK/\n/sg;

    # Exceptional punctuation
    #
    # We make special cases of some punctuation

    # remove any apostrophe that indicates omitted letters
    s/(\w+)\'(\w*\s)/ $1$2 /g;

    # remove period that appears in a person's initials
    s/\s([A-Z])\./ $1 /g;

    # replace hyphens in hyphenated words and names with a space
    s/([A-Za-z])-\s*([A-Za-z])/$1 $2/g;

    # Convert the remaining text to "clause format":
    # remove all excess punctuation and garbage text, normalise valid
    # punctuation to fullstops and commas, then put one clause per line.

    # Insert newline when the end of a sentence is detected
    # (delimiter is: "[\.\?\!]\s")
    s/\s*[\.\?\!]\s+/\n/g;

    # split numbers after four digits
    s/(\d\d\d\d)/$1 /g;

    # split words after 32 characters

    # squash repeated punctuation
    tr/A-Za-z0-9 //cs;

    # save email addresses
    # s/\w+@\w+\.[\w\.]+/EMAIL/g;

    # normalise clause breaks (mostly punctuation symbols) to commas
    s/[^A-Za-z0-9 \n]+/ , /g;

    # Remove repeated commas, and replace with newline
    s/\s*,[, ]+/\n/g;

    # remove extra whitespace
    s/ +/ /sg;
    s/^\s+//mg;
    s/\s*$/\n/mg;

    # remove lines that contain one word or less
    s/^\w*$//mg;
    s/^\s*$//mg;
    tr/\n//s;

    return $_;
}


# Execute a system command, echoing it at high verbosity and aborting
# the build if it fails.  (Note: exit() is given the raw $? status, as
# in the original.)
sub execute {
    my ($command, $verbosity) = @_;
    print "Executing: $command\n" if ($verbosity > 2);
    my $status = system($command);
    if ($status != 0) {
        print STDERR "phindgen.pl - Error executing $command: $!\n";
        exit($status);
    }
}
# Generate the vocabulary, symbol statistics, and numbers file from the
# clauses file.  This is legacy code, so is a bit messy and probably
# won't run under windows.
sub extract_vocabulary {
    my ($phindex_dir, $language, $verbosity) = @_;

    my ($w, $l, $line, $word);

    my ($first_delimiter, $last_delimiter,
        $first_stopword, $last_stopword,
        $first_extractword, $last_extractword,
        $first_contentword, $last_contentword,
        $phrasedelimiter);

    my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);

    my %symbol;
    my (%freq);

    print "Calculating vocabulary\n" if ($verbosity > 1);

    # Read and store the stopwords (unix-specific: find | xargs)
    my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
    my %stopwords;
    foreach my $w (split(/\s+/, $words)) {
        $l = lc($w);
        $stopwords{$l} = $w;
    }

    # Read and store the thesaurus terms
    # NOTE(review): the readline targets of the while loops in this sub
    # were lost in this copy of the file ("while()"); they have been
    # reconstructed as reading the handle just opened -- confirm against
    # the original phind.pm.
    $use_thesaurus = 0;
    my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
                                       "$language", "agrovoc.lex");
    if (-e "$lex_file") {
        open(TH, "<$lex_file") || die "Cannot open $lex_file: $!";
        while (<TH>) {
            s/^\d+ //;
            s/\(.*\)//;
            foreach my $w (split(/\s+/, $_)) {
                $thesaurus{lc($w)} = $w;
            }
        }
        close TH;
        $use_thesaurus = 1;
    }

    # Read words in the text and count occurrences
    open(TXT, "<$phindex_dir/clauses") || die "Cannot open $phindex_dir/clauses: $!";
    my @words;
    while (<TXT>) {
        $line = $_;
        next unless ($line =~ /./);

        @words = split(/\s+/, $line);
        foreach $w (@words) {
            $l = lc($w);
            $w = $l if ((defined $stopwords{$l}) || (defined $thesaurus{$l}));
            $freq{$w}++;
        }
        $freq{$senlimit}++;
    }
    close TXT;

    # Calculate the "best" (most frequent) form of each word
    my (%bestform, %totalfreq, %bestfreq);
    foreach $w (sort (keys %freq)) {
        $l = lc($w);

        # totalfreq is the number of times a term appears in any form
        $totalfreq{$l} += $freq{$w};

        if (defined $stopwords{$l}) {
            $bestform{$l} = $stopwords{$l};
        } elsif (defined $thesaurus{$l}) {
            $bestform{$l} = $thesaurus{$l};
        } elsif (!$bestform{$l} || ($freq{$w} > $bestfreq{$l})) {
            $bestfreq{$l} = $freq{$w};
            $bestform{$l} = $w;
        }
    }
    undef %freq;
    undef %bestfreq;

    # Assign symbol numbers to tokens
    my $nextsymbol = 1;
    my (@vocab);

    # Delimiters
    # BUGFIX(review): the original aliased the loop variable to the
    # file-scoped @delimiters and lowercased its elements in place;
    # iterate over a copy of each value instead.
    $first_delimiter = 1;
    foreach my $d (@delimiters) {
        $word = lc($d);
        $bestform{$word} = uc($word);
        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_delimiter = $nextsymbol - 1;

    # Stopwords
    $first_stopword = $nextsymbol;
    foreach my $word (sort keys %stopwords) {
        # don't include stopword unless it occurs in the text
        $word = lc($word);
        next unless ($totalfreq{$word});
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_stopword = $nextsymbol - 1;
    $first_contentword = $nextsymbol;

    # Thesaurus terms
    if ($use_thesaurus) {
        $first_thesaurusword = $nextsymbol;
        foreach my $word (sort keys %thesaurus) {
            $word = lc($word);
            next if ($symbol{$word});

            $bestform{$word} = $thesaurus{$word};
            $vocab[$nextsymbol] = $word;
            $symbol{$word} = $nextsymbol;
            $nextsymbol++;
        }
        $last_thesaurusword = $nextsymbol - 1;
    }

    # Other content words
    $first_extractword = $nextsymbol;
    foreach my $word (sort (keys %bestform)) {
        next if ($symbol{$word});

        $vocab[$nextsymbol] = $word;
        $symbol{$word} = $nextsymbol;
        $nextsymbol++;
    }
    $last_extractword = $nextsymbol - 1;
    $last_contentword = $nextsymbol - 1;

    # Output the words
    print "Saving vocabulary in $phindex_dir/clauses.vocab\n" if ($verbosity > 1);
    open(VOC, ">$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";
    for (my $i = 1; $i < $nextsymbol; $i++) {
        $w = $vocab[$i];
        print VOC "$bestform{$w}\n";
        $totalfreq{$w} = 0 unless ($totalfreq{$w});
    }
    close VOC;

    # Output statistics about the vocabulary
    print "Saving statistics in $phindex_dir/clauses.stats\n" if ($verbosity > 1);
    &util::rm("$phindex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");
    open(STAT, ">$phindex_dir/clauses.stats")
        || die "Cannot open $phindex_dir/clauses.stats: $!";
    print STAT "first_delimiter $first_delimiter\n";
    print STAT "last_delimiter $last_delimiter\n";
    print STAT "first_stopword $first_stopword\n";
    print STAT "last_stopword $last_stopword\n";
    if ($use_thesaurus) {
        print STAT "first_thesaurusword $first_thesaurusword\n";
        print STAT "last_thesaurusword $last_thesaurusword\n";
    }
    print STAT "first_extractword $first_extractword\n";
    print STAT "last_extractword $last_extractword\n";
    print STAT "first_contentword $first_contentword\n";
    print STAT "last_contentword $last_contentword\n";
    print STAT "first_symbol $first_delimiter\n";
    print STAT "last_symbol $last_contentword\n";
    print STAT "first_word $first_stopword\n";
    print STAT "last_word $last_contentword\n";
    close STAT;

    undef @vocab;

    # Save text as symbol numbers
    print "Saving text as numbers in $phindex_dir/clauses.numbers\n" if ($verbosity > 1);
    open(TXT, "<$phindex_dir/clauses") || die "Cannot open $phindex_dir/clauses: $!";
    open(NUM, ">$phindex_dir/clauses.numbers")
        || die "Cannot open $phindex_dir/clauses.numbers: $!";
    $phrasedelimiter = $symbol{lc($senlimit)};

    # set up the special symbols that delimit documents and sentences
    print NUM "$symbol{lc($colstart)}\n";

    while (<TXT>) {
        # split sentence into a list of tokens
        $line = $_;
        next unless ($line =~ /./);
        @words = split(/\s+/, $line);

        # output one token at a time
        foreach $word (@words) {
            $word = lc($word);
            print NUM "$symbol{$word}\n";
        }

        # output phrase delimiter
        print NUM "$phrasedelimiter\n";
    }
    print NUM "$symbol{lc($colend)}\n";
    close TXT;
    close NUM;
}


# Prepare the phrases file to be input to mgpp.
# This means renumbering the phrases in order of decreasing frequency.
# This is legacy code, and a little ugly, and may be unix-specific
# (particularly the sort command).
sub renumber_phrases {
    my ($phindex_dir, $verbosity) = @_;

    # Sort the phrases into order of increasing frequency
    # This means the expansions will be sorted correctly later on.
    # NOTE(review): "+2" is the obsolete sort key syntax (third ':'
    # field, reverse numeric); modern equivalent is "-k3,3rn -t:".
    print "Sorting phrases into freq order\n" if ($verbosity);
    system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases");

    my @symbol;

    # Read the vocabulary
    # NOTE(review): the readline targets of the while loops in this sub
    # were lost in this copy of the file ("while()"); they have been
    # reconstructed as reading the handle just opened -- confirm against
    # the original phind.pm.
    print "Reading the vocabulary\n" if ($verbosity);
    open(V, "<$phindex_dir/clauses.vocab")
        || die "Cannot open $phindex_dir/clauses.vocab: $!";
    my $i = 1;
    while (<V>) {
        chomp;
        $symbol[$i++] = $_;
    }
    close V;

    # Create file for phrase data
    #
    # The phrases file looks something like this
    #     159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
    #     159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
    #     159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
    #     159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361

    # The first field on each line is a unique phrase identifier.
    # We need to calculate phrase numbers for each phrase
    print "Calculate phrase numbers\n" if ($verbosity);
    my %phrasenumber;
    my $nextphrase = 1;
    my ($line);
    open(IN, "<$phindex_dir/phrases") || die "Cannot open $phindex_dir/phrases: $!";
    while (<IN>) {
        # read the line
        chomp;
        $line = $_;

        # we're only interested in the first field
        $line =~ s/:.*//;

        # get a phrase number for this line
        $phrasenumber{$line} = $nextphrase;
        $nextphrase++;
    }
    close IN;

    # Now we create a new phrase file using phrase numbers, not the old IDs.
    print "Format phrase data for MGPP\n" if ($verbosity);
    open(IN, "<$phindex_dir/phrases") || die "Cannot open $phindex_dir/phrases: $!";
    open(DATA, ">$phindex_dir/pdata.txt") || die "Cannot open $phindex_dir/pdata.txt: $!";
    open(IDX, ">$phindex_dir/pword.txt") || die "Cannot open $phindex_dir/pword.txt: $!";

    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
    my @fields;
    my @documents;
    my (@newexp, $k, $n);
    my $linenumber = 0;

    while (<IN>) {
        # read the line
        chomp;
        $line = $_;
        @fields = split(/:/, $line);

        # get a phrase number for this line
        $key = shift @fields;
        die unless (defined($phrasenumber{$key}));
        $num = $phrasenumber{$key};

        # get the text of the phrase, mapping symbol numbers (sNNN)
        # back to their vocabulary entries
        $text = shift @fields;
        $text =~ s/s(\d+)/$symbol[$1]/g;
        # single-word phrases also feed the word-level index
        if ($text =~ / /) {
            $word = "";
        } else {
            $word = $text;
        }

        $linenumber++;
        if ($linenumber % 1000 == 0) {
            print "line $linenumber:\t$num\t$key\t($text)\n" if ($verbosity > 2);
        }
        print "$num: $key\t($text)\n" if ($verbosity > 3);

        # get the phrase frequency
        $tf = shift @fields;

        # get the number of expansions
        $countexp = shift @fields;

        # get the expansions and convert them into phrase numbers
        $expansions = shift @fields;
        @newexp = ();
        foreach $k (split(/,/, $expansions)) {
            die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
            $n = $phrasenumber{$k};
            push @newexp, $n;
        }
        @newexp = sort numerically @newexp;

        # get the number of documents
        $countdocs = shift @fields;

        # get the documents, sorted by within-document frequency
        $documents = shift @fields;
        $documents =~ s/d//g;
        @documents = split(/;/, $documents);
        @documents = sort by_frequency @documents;

        # output the phrase data
        print DATA "$num:$text:$tf:$countexp:$countdocs:";
        print DATA join(",", @newexp), ":", join(";", @documents), "\n";

        # output the word index search data
        print IDX "$word\n";
    }
    close IN;
    close DATA;
    close IDX;
}


# sort routines used to renumber phrases

# ascending numeric comparison
sub numerically { $a <=> $b }

# Compare document entries of the form "docnum" or "docnum,freq",
# ordering by decreasing frequency; an entry with no ",freq" part
# counts as frequency 1.
sub by_frequency {
    my $fa = 1;
    if ($a =~ /,/) {
        $fa = $a;
        $fa =~ s/\d+,//;
    }
    my $fb = 1;
    if ($b =~ /,/) {
        $fb = $b;
        $fb =~ s/\d+,//;
    }
    return ($fb <=> $fa);
}

1;