Changeset 1591 for trunk/gsdl/src


Ignore:
Timestamp:
2000-10-06T13:32:32+13:00 (24 years ago)
Author:
paynter
Message:

Having a thesaurus is now optional.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/phindgen.pl

    r1574 r1591  
    3131# digital library collection. 
    3232#
    33 # It reads the GML files that have been imported to the archives directory,
    34 # and then creates the phind indexes in the phindex directory.
     33# The GML files that have been imported to the archives directory are
     34# read, and then the phind indexes are created in the phindex directory.
     35#
     36# This version of Phind uses suffix arrays to extract phrases.
    3537
    3638
     
    146148    # Generate the vocabulary, symbol statistics, and numbers file
    147149    # from the clauses file
    148     print "\nExtracting vocaulary and statistics\n" if $verbosity;
     150    print "\nExtracting vocabulary and statistics\n" if $verbosity;
    149151    &extract_vocabulary($phindexdir, $language, $verbosity);
    150152
     
    245247    my ($first_delimiter, $last_delimiter,
    246248    $first_stopword, $last_stopword,
    247     $first_thesaurusword, $last_thesaurusword,
    248249    $first_extractword, $last_extractword,
    249250    $first_contentword, $last_contentword,
    250251    $phrasedelimiter);
    251252
     253    my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword);
     254
     255
    252256    my %symbol;
    253     my %thesaurus;
    254 
    255257    my (%freq);
    256258
     
    266268   
    267269    # Read and store the thesaurus terms
     270    $use_thesaurus = 0;
    268271    my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind",
    269272                       "$language", "agrovoc.lex");
    270     die unless (-e "$lex_file");
    271     open(TH, "<$lex_file");
    272     while(<TH>) {
    273     s/^\d+ //;
    274     s/\(.*\)//;
    275     foreach my $w (split(/\s+/, $_)) {
    276         $thesaurus{lc($w)} = $w;
    277     }
    278     }
    279     close TH;
     273    if (-e "$lex_file") {
     274    open(TH, "<$lex_file");
     275    while(<TH>) {
     276        s/^\d+ //;
     277        s/\(.*\)//;
     278        foreach my $w (split(/\s+/, $_)) {
     279        $thesaurus{lc($w)} = $w;
     280        }
     281    }
     282    close TH;
     283    $use_thesaurus = 1;
     284    }
    280285
    281286    # Read words in the text and count occurrences
     
    320325    undef %bestfreq;
    321326   
     327
    322328    # Assign symbol numbers to tokens
    323329    my $nextsymbol = 1;
     
    328334   
    329335    foreach $word (@delimiters) {
    330    
     336
    331337    $word = lc($word);
    332338    $bestform{$word} = uc($word);
     
    352358    }
    353359    $last_stopword = $nextsymbol - 1;
     360    $first_contentword = $nextsymbol;
    354361   
    355362    # Thesaurus terms
    356     $first_thesaurusword = $nextsymbol;
    357     $first_contentword = $nextsymbol;
    358    
    359     foreach my $word (sort keys %thesaurus) {
    360    
    361     $word = lc($word);
    362     next if ($symbol{$word});
    363     $bestform{$word} = $thesaurus{$word};
    364    
    365     $vocab[$nextsymbol] = $word;
    366     $symbol{$word} = $nextsymbol;
    367     $nextsymbol++;
    368    
    369     }
    370     $last_thesaurusword = $nextsymbol - 1;
    371    
     363    if ($use_thesaurus) {
     364    $first_thesaurusword = $nextsymbol;
     365   
     366    foreach my $word (sort keys %thesaurus) {
     367       
     368        $word = lc($word);
     369        next if ($symbol{$word});
     370        $bestform{$word} = $thesaurus{$word};
     371       
     372        $vocab[$nextsymbol] = $word;
     373        $symbol{$word} = $nextsymbol;
     374        $nextsymbol++;
     375       
     376    }
     377    $last_thesaurusword = $nextsymbol - 1;
     378    }
     379
    372380    # Other content words
    373381    $first_extractword = $nextsymbol;
     
    408416    print STAT "first_stopword $first_stopword\n";
    409417    print STAT "last_stopword $last_stopword\n";
    410     print STAT "first_thesaurusword $first_thesaurusword\n";
    411     print STAT "last_thesaurusword $last_thesaurusword\n";
     418    if ($use_thesaurus) {
     419    print STAT "first_thesaurusword $first_thesaurusword\n";
     420    print STAT "last_thesaurusword $last_thesaurusword\n";
     421    }
    412422    print STAT "first_extractword $first_extractword\n";
    413423    print STAT "last_extractword $last_extractword\n";
Note: See TracChangeset for help on using the changeset viewer.