Changeset 1591 for trunk/gsdl/src
- Timestamp:
- 2000-10-06T13:32:32+13:00 (24 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/phind/generate/phindgen.pl
r1574 r1591 31 31 # digital library collection. 32 32 # 33 # It reads the GML files that have been imported to the archives directory, 34 # and then creates the phind indexes in the phindex directory. 33 # The GML files that have been imported to the archives directory are 34 # read, and then the phind indexes are created in the phindex directory. 35 # 36 # This version of Phind uses suffix arrays to extract phrases. 35 37 36 38 … … 146 148 # Generate the vocabulary, symbol statistics, and numbers file 147 149 # from the clauses file 148 print "\nExtracting voca ulary and statistics\n" if $verbosity;150 print "\nExtracting vocabulary and statistics\n" if $verbosity; 149 151 &extract_vocabulary($phindexdir, $language, $verbosity); 150 152 … … 245 247 my ($first_delimiter, $last_delimiter, 246 248 $first_stopword, $last_stopword, 247 $first_thesaurusword, $last_thesaurusword,248 249 $first_extractword, $last_extractword, 249 250 $first_contentword, $last_contentword, 250 251 $phrasedelimiter); 251 252 253 my ($use_thesaurus, %thesaurus, $first_thesaurusword, $last_thesaurusword); 254 255 252 256 my %symbol; 253 my %thesaurus;254 255 257 my (%freq); 256 258 … … 266 268 267 269 # Read and store the thesaurus terms 270 $use_thesaurus = 0; 268 271 my $lex_file = &util::filename_cat("$ENV{'GSDLHOME'}", "etc", "phind", 269 272 "$language", "agrovoc.lex"); 270 die unless (-e "$lex_file"); 271 open(TH, "<$lex_file"); 272 while(<TH>) { 273 s/^\d+ //; 274 s/\(.*\)//; 275 foreach my $w (split(/\s+/, $_)) { 276 $thesaurus{lc($w)} = $w; 277 } 278 } 279 close TH; 273 if (-e "$lex_file") { 274 open(TH, "<$lex_file"); 275 while(<TH>) { 276 s/^\d+ //; 277 s/\(.*\)//; 278 foreach my $w (split(/\s+/, $_)) { 279 $thesaurus{lc($w)} = $w; 280 } 281 } 282 close TH; 283 $use_thesaurus = 1; 284 } 280 285 281 286 # Read words in the text and count occurrences … … 320 325 undef %bestfreq; 321 326 327 322 328 # Assign symbol numbers to tokens 323 329 my $nextsymbol = 1; … … 328 334 329 335 
foreach $word (@delimiters) { 330 336 331 337 $word = lc($word); 332 338 $bestform{$word} = uc($word); … … 352 358 } 353 359 $last_stopword = $nextsymbol - 1; 360 $first_contentword = $nextsymbol; 354 361 355 362 # Thesaurus terms 356 $first_thesaurusword = $nextsymbol; 357 $first_contentword = $nextsymbol; 358 359 foreach my $word (sort keys %thesaurus) { 360 361 $word = lc($word); 362 next if ($symbol{$word}); 363 $bestform{$word} = $thesaurus{$word}; 364 365 $vocab[$nextsymbol] = $word; 366 $symbol{$word} = $nextsymbol; 367 $nextsymbol++; 368 369 } 370 $last_thesaurusword = $nextsymbol - 1; 371 363 if ($use_thesaurus) { 364 $first_thesaurusword = $nextsymbol; 365 366 foreach my $word (sort keys %thesaurus) { 367 368 $word = lc($word); 369 next if ($symbol{$word}); 370 $bestform{$word} = $thesaurus{$word}; 371 372 $vocab[$nextsymbol] = $word; 373 $symbol{$word} = $nextsymbol; 374 $nextsymbol++; 375 376 } 377 $last_thesaurusword = $nextsymbol - 1; 378 } 379 372 380 # Other content words 373 381 $first_extractword = $nextsymbol; … … 408 416 print STAT "first_stopword $first_stopword\n"; 409 417 print STAT "last_stopword $last_stopword\n"; 410 print STAT "first_thesaurusword $first_thesaurusword\n"; 411 print STAT "last_thesaurusword $last_thesaurusword\n"; 418 if ($use_thesaurus) { 419 print STAT "first_thesaurusword $first_thesaurusword\n"; 420 print STAT "last_thesaurusword $last_thesaurusword\n"; 421 } 412 422 print STAT "first_extractword $first_extractword\n"; 413 423 print STAT "last_extractword $last_extractword\n";
Note:
See TracChangeset
for help on using the changeset viewer.