Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1829

Timestamp:

2001-01-11T10:12:20+13:00 (23 years ago)

Author:

paynter

Message:

Accept a "thesaurus=name" option that identifies a thesaurus in a
collection's etc directory. Incorporate the thesaurus data into the MGPP
output, including phrase entries for every thesaurus term, and thesaurus
link information encoding the thesaurus structure. At the same time I've
changed the way phrases are numbered - they are no longer sorted in order
of decreasing frequency - which has allowed me to get rid of the call to
Unix sort, which was the main impediment to a Windows version.

File:

: 1 edited

trunk/gsdl/perllib/classify/phind.pm (modified) (19 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/classify/phind.pm

-              r1808
+              r1829
 #   savephrases=filename  If set, phrase information will be stored in filename
 #                         as text. (By default, it is not set.)
+#   thesaurus=name        Name of a thesaurus stored in phind format in etc dir.
 # How a classifier works.
 …
     my $suffixmode = 1;
     my $suffixsize = 40000000;
     my $savephrases = "";
+    my $savephrases = 0;
     my $verbosity = 2;
     my $untidy = 0;
+    my $thesaurus = "";
     # parse the options
 …
     } elsif ($option =~ /^suffixmode=(.*)$/i) {
         $suffixmode = $1;
+    } elsif ($option =~ /^thesaurus=(.*)$/i) {
+        $thesaurus = $1;
     } elsif ($option =~ /^untidy/i) {
         $untidy = 1;
 …
     $self->{'suffixmode'} = $suffixmode;
     $self->{'suffixsize'} = $suffixsize;
+    $self->{'savephrases'} = $savephrases if ($savephrases);
+    $self->{'savephrases'} = $savephrases;
+    $self->{'thesaurus'} = $thesaurus;
     # limit languages
 …
     $self->{'delimiter'} = $delimiter;
+    # collection directory
+    $self->{'collectiondir'} = $ENV{'GSDLCOLLECTDIR'};
     # build directory
     if (!$builddir) {
 …
     $self->{'verbosity'} = $verbosity;
     $self->{'untidy'} = $untidy;
+    $self->{'out'} = $out;
     return bless $self, $class;
 …
     # from the clauses file
     print "\nExtracting vocabulary and statistics\n" if $verbosity;
     &extract_vocabulary($phinddir, $language, $verbosity);
+    &extract_vocabulary($self);
     # Use the suffix program to generate the phind/phrases file
 …
 sub extract_vocabulary {
+    my ($phind_dir, $language, $verbosity) = @_;
+    my ($self) = @_;
+    my $verbosity = $self->{'verbosity'};
+    my $out = $self->{'out'};
+    my $language = "english"; # $self->{'language'};
+    my $collectiondir = $self->{'collectiondir'};
+    my $phinddir = $self->{'phinddir'};
     my ($w, $l, $line, $word);
     my ($first_delimiter, $last_delimiter,
     $first_stopword, $last_stopword,
 …
     $first_contentword, $last_contentword,
     $phrasedelimiter);
+    my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
+    my $thesaurus = $self->{'thesaurus'};
+    my ($thesaurus_links, $thesaurus_terms,
+    %thesaurus, $first_thesaurusword, $last_thesaurusword);
     my %symbol;
     my (%freq);
     print "Calculating vocabulary\n" if ($verbosity > 1);
+    print $out "Calculating vocabulary\n" if ($verbosity > 1);
     # Read and store the stopwords
     my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`;
     my %stopwords;
     foreach my $w (split(/\s+/, $words)) {
+    foreach $w (split(/\s+/, $words)) {
     $l = lc($w);
     $stopwords{$l} = $w;
+    }
+    # Read and store the thesaurus terms
+    $use_thesaurus = 0;
+    my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
+                       "$language", "agrovoc.lex");
+    if (-e "$lex_file") {
+    open(TH, "<$lex_file");
+    # Read thesaurus information
+    if ($thesaurus) {
+    # Ensure both link and term files exist
+    $thesaurus_links = &util::filename_cat($collectiondir, "etc", "$thesaurus.lnk");
+    die "Cannot find thesaurus link file" unless (-e "$thesaurus_links");
+    $thesaurus_terms = &util::filename_cat($collectiondir, "etc", "$thesaurus.EN");
+    die "Cannot find thesaurus term file" unless (-e "$thesaurus_terms");
+    # Read the thesaurus terms
+    open(TH, "<$thesaurus_terms");
     while(<TH>) {
         s/^\d+ //;
         s/\(.*\)//;
         foreach my $w (split(/\s+/, $_)) {
+        foreach $w (split(/\s+/, $_)) {
         $thesaurus{lc($w)} = $w;
+        }
+    }
     close TH;
-    $use_thesaurus = 1;
+    }
     # Read words in the text and count occurrences
     open(TXT, "<$phind_dir/clauses");
+    open(TXT, "<$phinddir/clauses");
     my @words;
 …
     # Thesaurus terms
     if ($use_thesaurus) {
+    if ($thesaurus) {
     $first_thesaurusword = $nextsymbol;
 …
     # Output the words
     print "Saving vocabulary in $phind_dir/clauses.vocab\n" if ($verbosity > 1);
     open(VOC, ">$phind_dir/clauses.vocab");
+    print $out "Saving vocabulary in $phinddir/clauses.vocab\n" if ($verbosity > 1);
+    open(VOC, ">$phinddir/clauses.vocab");
     for (my $i = 1; $i < $nextsymbol; $i++) {
 …
+    # Create statistics file
     # Output statistics about the vocabulary
+    print "Saving statistics in $phind_dir/clauses.stats\n" if ($verbosity > 1);
+    &util::rm("$phind_dir/clauses.stats") if (-e "$phind_dir/clauses.stats");
+    open(STAT, ">$phind_dir/clauses.stats")
+    || die "Cannot open $phind_dir/clauses.stats: $!";
+    print $out "Saving statistics in $phinddir/clauses.stats\n" if ($verbosity > 1);
+    &util::rm("$phinddir/clauses.stats") if (-e "$phinddir/clauses.stats");
+    open(STAT, ">$phinddir/clauses.stats")
+    || die "Cannot open $phinddir/clauses.stats: $!";
     print STAT "first_delimiter $first_delimiter\n";
 …
     print STAT "first_stopword $first_stopword\n";
     print STAT "last_stopword $last_stopword\n";
     if ($use_thesaurus) {
+    if ($thesaurus) {
     print STAT "first_thesaurusword $first_thesaurusword\n";
     print STAT "last_thesaurusword $last_thesaurusword\n";
 …
+    # Create numbers file
     # Save text as symbol numbers
     print "Saving text as numbers in $phind_dir/clauses.numbers\n" if ($verbosity > 1);
     open(TXT, "<$phind_dir/clauses");
     open(NUM, ">$phind_dir/clauses.numbers");
+    print $out "Saving text as numbers in $phinddir/clauses.numbers\n" if ($verbosity > 1);
+    open(TXT, "<$phinddir/clauses");
+    open(NUM, ">$phinddir/clauses.numbers");
     $phrasedelimiter = $symbol{lc($senlimit)};
 …
     print NUM "$symbol{lc($colend)}\n";
+    close NUM;
+    # Save thesaurus data in one convenient file
+    if ($thesaurus) {
+    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
+    print $out "Saving thesaurus as numbers in $thesaurusfile\n"
+        if ($verbosity > 1);
+    # Read the thesaurus terms
+    my ($num, $text, %thes_symbols);
+    open(TH, "<$thesaurus_terms");
+    while(<TH>) {
+        chomp;
+        @words = split(/\s+/, $_);
+        $num = shift @words;
+        $text = "";
+        # translate words into symbol numbers
+        foreach $word (@words) {
+        $word = lc($word);
+        if ($symbol{$word}) {
+            $text .= "s$symbol{$word} ";
+        } elsif ($verbosity) {
+            print $out "phind: No thesaurus symbol, ignoring \"$word\"\n";
+        }
+        }
+        $text =~ s/ $//;
+        $thes_symbols{$num} = $text;
+    }
+    close TH;
+    # Read the thesaurus links and write the corresponding data
+    open(TH, "<$thesaurus_links");
+    open(THOUT, ">$thesaurusfile");
+    while(<TH>) {
+        chomp;
+        ($num, $text) = split(/:/, $_);
+        if (defined($thes_symbols{$num})) {
+        print THOUT "$num:$thes_symbols{$num}:$text\n";
+        } else {
+        print THOUT "$num:untranslated:$text\n";
+        }
+    }
+    close TH;
+    close THOUT;
+    }
+}
+# Prepare the phrases file to be input to mgpp.
+# This means renumbering the phrases in order of decreasing frequency.
+# This is legacy code, and a little ugly, and may be unix-specific
+# (particularly the sort command).
+# renumber_phrases
+#
+# Prepare the phrases file to be input to mgpp.  The biggest problem is
+# reconciling the phrase identifiers used by the suffix program (which
+# we'll call suffix-id numbers) with the numbers used in the thesaurus
+# (thesaurus-id) to create a common set of phind id numbers (phind-id).
+# Phind-id numbers must be sorted by frequency of occurrence.
+#
+# Start creating a set of phind-id numbers from the sorted suffix-id
+# numbers and (if required) the thesaurus-id numbers.  Then add any other
+# phrases occurring in the thesaurus.
+#
+# The last thing we have to do is restore the vocabulary information to the
+# phrase file so that the phrases are stored as words, not as symbol
+# numbers.
+# The original phrases file looks something like this:
+#   159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
+#   159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
+#   159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
+#   159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
 sub renumber_phrases {
+    my $self = shift (@_);
+    my ($self) = @_;
+    renumber_suffix_data($self);
+    renumber_thesaurus_data($self);
+    restore_vocabulary_data($self);
+}
+# renumber_suffix_data
+#
+# Translate phrases file to phrases.2 using phind keys instead
+# of suffix keys and sorting the expansion data.
+sub renumber_suffix_data {
+    my ($self) = @_;
     my $verbosity = $self->{'verbosity'};
+    my $phind_dir = $self->{'phinddir'};
+    my $savephrases = 0;
+    $savephrases = $self->{'savephrases'} if (defined($self->{'savephrases'}));
+    # Sort the phrases into order of increasing frequency
+    # This means the expansions will be sorted correctly later on.
+    print "Sorting phrases into freq order\n" if ($verbosity);
+    system("sort -rnt ':' +2 -o $phind_dir/phrases $phind_dir/phrases");
+    # Read the vocabulary
+    my $out = $self->{'out'};
+    print $out "Translate phrases: suffix-ids become phind-id's\n"
+    if ($verbosity);
+    my $phinddir = $self->{'phinddir'};
+    my $infile = &util::filename_cat($phinddir, 'phrases');
+    my $outfile = &util::filename_cat($phinddir, 'phrases.2');
+    # Read the phrase file.  Calculate initial set of phind-id
+    # numbers and store (suffixid -> frequency) relation.
+    my %suffixtophind;
+    my @phindfrequency;
+    my (@fields, $suffixid);
+    my $nextphind = 1;
+    open(IN, "<$infile");
+    while(<IN>) {
+    chomp;
+    @fields = split(/:/, $_);
+    # get next suffixid and phindid
+    $suffixid = shift @fields;
+    $suffixtophind{$suffixid} = $nextphind;
+    # store total frequency
+    shift @fields;
+    $totalfrequency[$nextphind] = shift @fields;
+    $nextphind++;
+    }
+    close IN;
+    # Translate phrases file to phrases.2.  Use phind keys (not suffix
+    # keys), sort expansion and document occurrence data in order of
+    # descending frequency.
+    open(IN, "<$infile");
+    open(OUT, ">$outfile");
+    my ($phindid, $text, $tf, $countexp, $expansions, $countdocs, $documents);
+    my (@documwents, @newexp, $k, $n);
+    my $linenumber = 0;
+    while(<IN>) {
+    # read the line
+    chomp;
+    @fields = split(/:/, $_);
+    # get a phrase number for this line
+    $suffixid = shift @fields;
+    die unless (defined($suffixtophind{$suffixid}));
+    $phindid = $suffixtophind{$suffixid};
+    # get the symbols in the phrase
+    $text = shift @fields;
+    # output status information
+    $linenumber++;
+    if ($verbosity > 2) {
+        if ($linenumber % 1000 == 0) {
+        print $out "line $linenumber:\t$phindid\t$suffixid\t($text)\n";
+        }
+        print $out "$num: $key\t($text)\n" if ($verbosity > 3);
+    }
+    # get the phrase frequency
+    $tf = shift @fields;
+    # get the number of expansions
+    $countexp = shift @fields;
+    # get the expansions, convert them into phind-id numbers, and sort them
+    $expansions = shift @fields;
+    @newexp = ();
+    foreach $k (split(/,/, $expansions)) {
+        die "ERROR - no phindid for: $k" unless (defined($suffixtophind{$k}));
+        $n = $suffixtophind{$k};
+        push @newexp, $n;
+    }
+    @newexp = sort {$totalfrequency[$b] <=> $totalfrequency[$a]} @newexp;
+    # get the number of documents
+    $countdocs = shift @fields;
+    # get the documents and sort them
+    $documents = shift @fields;
+    $documents =~ s/d//g;
+    @documents = split(/;/, $documents);
+    @documents = sort by_doc_frequency @documents;
+    # output the phrase data
+    print OUT "$phindid:$text:$tf:$countexp:$countdocs:";
+    print OUT join(",", @newexp), ",:", join(";", @documents), ";\n";
+    }
+    close IN;
+    close OUT;
+}
+# renumber_thesaurus_data
+#
+# Translate phrases.2 to phrases.3, adding thesaurus data if available.
+sub renumber_thesaurus_data {
+    my ($self) = @_;
+    my $out = $self->{'out'};
+    my $verbosity = $self->{'verbosity'};
+    my $thesaurus = $self->{'thesaurus'};
+    my $phinddir = $self->{'phinddir'};
+    my $infile = &util::filename_cat($phinddir, "phrases.2");
+    my $outfile = &util::filename_cat($phinddir, "phrases.3");
+    # If no thesaurus is defined, simply move the phrases file.
+    if (!$thesaurus) {
+    print $out "Translate phrases.2: no thesaurus data\n"
+        if ($verbosity);
+    &util::mv($infile, $outfile);
+    return;
+    }
+    print $out "Translate phrases.2: add thesaurus data\n"
+    if ($verbosity);
+    # 1.
+    # Read thesaurus file and store (symbols->thesaurusid) mapping
+    my $thesaurusfile = &util::filename_cat($phinddir, "$thesaurus.numbers");
+    my %symbolstothesid;
+    my (@fields, $thesid, $symbols);
+    open(TH, "<$thesaurusfile");
+    while (<TH>) {
+    chomp;
+    @fields = split(/:/, $_);
+    # get id and text
+    $thesid = shift @fields;
+    $symbols = shift @fields;
+    $symbolstothesid{$symbols} = $thesid;
+    }
+    close TH;
+    # 2.
+    # Read phrases file and note all thesaurus entries that already
+    # have a phindid
+    my %thesaurustophindid;
+    my ($phindid);
+    open(IN, "<$infile");
+    while(<IN>) {
+    chomp;
+    @fields = split(/:/, $_);
+    # phindid and symbols for this line
+    $phindid = shift @fields;
+    $symbols = shift @fields;
+    # do we have a thesaurus id corresponding to this phrase?
+    if (defined($symbolstothesid{$symbols})) {
+        $thesid = $symbolstothesid{$symbols};
+        $thesaurustophindid{$thesid} = $phindid;
+    }
+    }
+    close IN;
+    undef %symbolstothesid;
+    # 3.
+    # Create phind-id numbers for remaining thesaurus entries
+    my $nextphindid = $phindid + 1;
+    open(TH, "<$thesaurusfile");
+    while(<TH>) {
+    chomp;
+    @fields = split(/:/, $_);
+    # read thesaurus-id and ensure it has a corresponding phind-id
+    $thesid = shift @fields;
+    if (!defined($thesaurustophindid{$thesid})) {
+        $thesaurustophindid{$thesid} = $nextphindid;
+        $nextphindid++;
+    }
+    }
+    close TH;
+    # 4.
+    # Translate thesaurus file, replacing thesaurus-id numbers with
+    # phind-id numbers.
+    my $newthesaurusfile = &util::filename_cat($phinddir, "$thesaurus.phindid");
+    my ($relations, $linkcounter, $linktext, $linktype, @linkdata, $link);
+    open(TH, "<$thesaurusfile");
+    open(TO, ">$newthesaurusfile");
+    while(<TH>) {
+    chomp;
+    @fields = split(/:/, $_);
+    # phindid and symbols for this line
+    ($thesid, $symbols, $relations) = @fields;
+    die unless ($thesid && $symbols);
+    die unless $thesaurustophindid{$thesid};
+    $phindid = $thesaurustophindid{$thesid};
+    # convert each part of the relation string to use phind-id numbers
+    $newrelation = "";
+    $linkcounter = 0;
+    foreach $linktext (split(/;/, $relations)) {
+        @linkdata = split(/,/, $linktext);
+        # remember the linktype (e.g. BT, NT)
+        $linktype = shift @linkdata;
+        $newrelation .= "$linktype,";
+        # convert the link target identifiers
+        foreach $link (@linkdata) {
+        die unless (defined($thesaurustophindid{$link}));
+        $newrelation .= "$thesaurustophindid{$link},";
+        $linkcounter++;
+        }
+        $newrelation =~ s/\,$//;
+        $newrelation .= ";";
+    }
+    $newrelation .= ":";
+    print TO "$phindid:$symbols:$linkcounter:$newrelation\n";
+    }
+    close TH;
+    close TO;
+    undef %thesaurustophindid;
+    # 5.
+    # Read thesaurus data (in phind-id format) into memory
+    my %thesaurusdata;
+    open(TH, "<$newthesaurusfile");
+    while(<TH>) {
+    chomp;
+    ($phindid, $symbols, $linkcounter, $relations) = split(/:/, $_);
+    die unless ($phindid && $symbols);
+    $thesaurusdata{$phindid} = "$symbols:$linkcounter:$relations";
+    }
+    # 6.
+    # Add thesaurus data to phrases file
+    my ($text, $tf, $countexp, $expansions, $countdocs, $documents);
+    my (@documwents, @newexp, $k, $n);
+    my $linenumber = 0;
+    open(IN, "<$infile");
+    open(OUT, ">$outfile");
+    # Update existing phrases
+    while(<IN>) {
+    chomp;
+    @fields = split(/:/, $_);
+    # get data for this line
+    $phindid = shift @fields;
+    # output the phrase data, with thesaurus information
+    print OUT "$phindid:", join(":", @fields);
+    # add thesaurus data
+    if (defined($thesaurusdata{$phindid})) {
+        @fields = split(/:/, $thesaurusdata{$phindid});
+        shift @fields;
+        $linkcounter = shift @fields;
+        $relations = shift @fields;
+        print OUT ":$linkcounter:$relations";
+        $thesaurusdata{$phindid} = "";
+    }
+    print OUT "\n";
+    }
+    close IN;
+    # Add phrases that aren't already in the file
+    foreach $phindid (sort numerically keys %thesaurusdata) {
+    next unless ($thesaurusdata{$phindid});
+    @fields = split(/:/, $thesaurusdata{$phindid});
+    $symbols = shift @fields;
+    $linkcounter = shift @fields;
+    $relations = shift @fields;
+    print OUT "$phindid:$symbols:0:0:0:::$linkcounter:$relations\n";
+    }
+    close OUT;
+}
+# restore_vocabulary_data
+#
+# Read phrases.3 and restore vocabulary information. Then write
+# this data to the MGPP input files (pword.txt and pdata.txt) and
+# (if requested) to the saved phrases file.
+sub restore_vocabulary_data {
+    my ($self) = @_;
+    my $out = $self->{'out'};
+    my $verbosity = $self->{'verbosity'};
+    print $out "Translate phrases.3: restore vocabulary\n" if ($verbosity);
+    my $phinddir = $self->{'phinddir'};
+    my $infile = &util::filename_cat($phinddir, 'phrases.3');
+    my $vocabfile = &util::filename_cat($phinddir, 'clauses.vocab');
+    my $datafile = &util::filename_cat($phinddir, 'pdata.txt');
+    my $wordfile = &util::filename_cat($phinddir, 'pword.txt');
+    my $savephrases = $self->{'savephrases'};
+    # 1.
+    # Read the vocabulary file
+    open(V, "<$vocabfile")
+    || die "Cannot open $vocabfile: $!";
     my @symbol;
-    print "Reading the vocabulary\n" if ($verbosity);
-    open(V, "<$phind_dir/clauses.vocab")
-    || die "Cannot open $phind_dir/clauses.vocab: $!";
     my $i = 1;
     while(<V>) {
 …
+    }
+    # Create file for phrase data
+    #
+    # The phrases file looks something like this
+    #  159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254
+    #  159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380
+    #  159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374
+    #  159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361
+    # The first field on each line is a unique phrase identifier.
+    # We need to calculate phrase numbers for each phrase
+    print "Calculate phrase numbers\n" if ($verbosity);
+    my %phrasenumber;
+    my $nextphrase = 1;
+    my ($line);
+    open(IN, "<$phind_dir/phrases");
+    while(<IN>) {
+    # read the line
+    chomp;
+    $line = $_;
+    # we're only interested in the first field
+    $line =~ s/:.*//;
+    # get a phrase number for this line
+    $phrasenumber{$line} = $nextphrase;
+    $nextphrase++;
+    }
+    # Now we create a new phrase file using phrase numbers, not the old IDs.
+    print "Format phrase data for MGPP\n" if ($verbosity);
+    # Open the basic files
+    open(IN, "<$phind_dir/phrases");
+    open(DATA, ">$phind_dir/pdata.txt");
+    open(IDX, ">$phind_dir/pword.txt");
+    # We may want to save the phrases in a separate text file
+    # 2.
+    # Translate phrases.3 to MGPP input files
+    my ($key, $text, $word);
+    my @fields;
+    my $linenumber = 0;
+    open(IN, "<$infile");
+    open(DATA, ">$datafile");
+    open(WORD, ">$wordfile");
+    # Save the phrases in a separate text file
     if ($savephrases) {
     print "Saving phrases in $savephrases\n" if ($verbosity);
+    print $out "Saving phrases in $savephrases\n" if ($verbosity);
     open(SAVE, ">$savephrases");
+    }
-    my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word);
-    my @fields;
-    my @documents;
-    my (@newexp, $k, $n);
-    my $linenumber = 0;
     while(<IN>) {
 …
     # get a phrase number for this line
     $key = shift @fields;
+    die unless (defined($phrasenumber{$key}));
+    $num = $phrasenumber{$key};
+    # get the text of the phrase
+    # restore the text of the phrase
     $text = shift @fields;
     $text =~ s/s(\d+)/$symbol[$1]/g;
     if ($text =~ / /) {
         $word = "";
     } else {
+    } elsif ($text ne 'untranslated') {
         $word = $text;
+    }
-    $linenumber++;
-    if ($linenumber % 1000 == 0) {
-        print "line $linenumber:\t$num\t$key\t($text)\n"  if ($verbosity > 2);
+    }
-    print "$num: $key\t($text)\n" if ($verbosity > 3);
-    # get the phrase frequency
-    $tf = shift @fields;
-    # get the number of expansions
-    $countexp = shift @fields;
-    # get the expansions and convert them into phrase numbers
-    $expansions = shift @fields;
-    @newexp = ();
-    foreach $k (split(/,/, $expansions)) {
-        die "ERROR - no phrase number for: $k" unless (defined($phrasenumber{$k}));
-        $n = $phrasenumber{$k};
-        push @newexp, $n;
+    }
-    @newexp = sort numerically @newexp;
-    # get the number of documents
-    $countdocs = shift @fields;
-    # get the documents
-    $documents = shift @fields;
-    $documents =~ s/d//g;
-    @documents = split(/;/, $documents);
-    @documents = sort by_frequency @documents;
     # output the phrase data
     print DATA "<Document>";
+    print DATA "$num:$text:$tf:$countexp:$countdocs:";
+    print DATA join(",", @newexp), ":", join(";", @documents), "\n";
+    print DATA "$key:$text:", join(":", @fields), ":\n";
     # output the word index search data
     print IDX "<Document>$word\n";
+    print WORD "<Document>$word\n";
     # output the phrases to a text file
 …
         print SAVE "$tf\t$countdocs\t$text\n";
+    }
+    }
+    }
+    close IN;
+    close WORD;
+    close DATA;
     close SAVE if ($savephrases);
+}
 # sort routines used to renumber phrases
 sub numerically { $a <=> $b }
 sub by_frequency {
+sub by_doc_frequency {
     my $fa = 1;
     if ($a =~ /,/) {
 …
+}
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1829

Legend:

trunk/gsdl/perllib/classify/phind.pm

Download in other formats: