Context Navigation

← Previous Change
Next Change →

Changeset 1591 for trunk/gsdl/src

Timestamp:

2000-10-06T13:32:32+13:00 (24 years ago)

Author:

paynter

Message:

Having a thesaurus is now optional.

File:

: 1 edited

trunk/gsdl/src/phind/generate/phindgen.pl (modified) (8 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/src/phind/generate/phindgen.pl

-              r1574
+              r1591
 # digital library collection.
+#
+# It reads the GML files that have been imported to the archives directory,
+# and then creates the phind indexes in the phindex directory.
+# The GML files that have been imported to the archives directory are
+# read, and then the phind indexes are created in the phindex directory.
+#
+# This version of Phind uses suffix arrays to extract phrases.
 …
     # Generate the vocabulary, symbol statistics, and numbers file
     # from the clauses file
     print "\nExtracting vocaulary and statistics\n" if $verbosity;
+    print "\nExtracting vocabulary and statistics\n" if $verbosity;
     &extract_vocabulary($phindexdir, $language, $verbosity);
 …
     my ($first_delimiter, $last_delimiter,
     $first_stopword, $last_stopword,
-    $first_thesaurusword, $last_thesaurusword,
     $first_extractword, $last_extractword,
     $first_contentword, $last_contentword,
     $phrasedelimiter);
+    my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
     my %symbol;
-    my %thesaurus;
     my (%freq);
 …
     # Read and store the thesaurus terms
+    $use_thesaurus = 0;
     my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
                        "$language", "agrovoc.lex");
+    die unless (-e "$lex_file");
+    open(TH, "<$lex_file");
+    while(<TH>) {
+    s/^\d+ //;
+    s/\(.*\)//;
+    foreach my $w (split(/\s+/, $_)) {
+        $thesaurus{lc($w)} = $w;
+    }
+    }
+    close TH;
+    if (-e "$lex_file") {
+    open(TH, "<$lex_file");
+    while(<TH>) {
+        s/^\d+ //;
+        s/\(.*\)//;
+        foreach my $w (split(/\s+/, $_)) {
+        $thesaurus{lc($w)} = $w;
+        }
+    }
+    close TH;
+    $use_thesaurus = 1;
+    }
     # Read words in the text and count occurrences
 …
     undef %bestfreq;
     # Assign symbol numbers to tokens
     my $nextsymbol = 1;
 …
     foreach $word (@delimiters) {
     $word = lc($word);
     $bestform{$word} = uc($word);
 …
+    }
     $last_stopword = $nextsymbol - 1;
+    $first_contentword = $nextsymbol;
     # Thesaurus terms
+    $first_thesaurusword = $nextsymbol;
+    $first_contentword = $nextsymbol;
+    foreach my $word (sort keys %thesaurus) {
+    $word = lc($word);
+    next if ($symbol{$word});
+    $bestform{$word} = $thesaurus{$word};
+    $vocab[$nextsymbol] = $word;
+    $symbol{$word} = $nextsymbol;
+    $nextsymbol++;
+    }
+    $last_thesaurusword = $nextsymbol - 1;
+    if ($use_thesaurus) {
+    $first_thesaurusword = $nextsymbol;
+    foreach my $word (sort keys %thesaurus) {
+        $word = lc($word);
+        next if ($symbol{$word});
+        $bestform{$word} = $thesaurus{$word};
+        $vocab[$nextsymbol] = $word;
+        $symbol{$word} = $nextsymbol;
+        $nextsymbol++;
+    }
+    $last_thesaurusword = $nextsymbol - 1;
+    }
     # Other content words
     $first_extractword = $nextsymbol;
 …
     print STAT "first_stopword $first_stopword\n";
     print STAT "last_stopword $last_stopword\n";
+    print STAT "first_thesaurusword $first_thesaurusword\n";
+    print STAT "last_thesaurusword $last_thesaurusword\n";
+    if ($use_thesaurus) {
+    print STAT "first_thesaurusword $first_thesaurusword\n";
+    print STAT "last_thesaurusword $last_thesaurusword\n";
+    }
     print STAT "first_extractword $first_extractword\n";
     print STAT "last_extractword $last_extractword\n";

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1591 for trunk/gsdl/src

Legend:

trunk/gsdl/src/phind/generate/phindgen.pl

Download in other formats: