Changeset 1883 for trunk/gsdl/perllib
- Timestamp:
- 2001-01-31T14:10:26+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/phind.pm
r1871 r1883 30 30 # 31 31 # options are: 32 # button=Name The label for the classifiers button in the 32 # -button Name The label for the classifiers button in the 33 33 # navigation bar (defaults to "Phrase"). 34 34 # -title Title The metadata field used to describe each document … … 40 40 # -untidy Do not clean up intermediate files 41 41 # -suffixmode num Mode of suffix program (0 = all phrases, 1 = stopword) 42 # -suffixsize num Number of symbols available to suffix program 43 42 # -savephrases filename If set, phrase information will be stored in filename 44 43 # as text. (By default, it is not set.) … … 91 90 92 91 options: 93 -title Title to use on web pages 94 -text 95 -title 96 -button 97 -language 98 -savephrases 99 -suffixsize 100 -suffixmode 101 -thesaurus 102 -untidy 103 "; 104 } 92 -text Fields The text used to build the phrase hierarchy. 93 (default: 'section:Title,section:text') 94 95 -title Title The metadata field used to describe each document. 96 (default: 'Title') 97 98 -button Name The label for the classifier screen and button in 99 navigation bar. 100 (default: 'Phrase') 101 102 -language Regex Language or languages to use building hierarchy. 103 Languages are identified by two-letter country codes 104 like en (English), es (Spanish), and fr (French). 105 Language is a regular expression, so 'en|fr' (English or 106 French) and '..' (match any language) are valid. 107 (default: 'en'.) 108 109 -savephrases File If set, the phrase infomation will be stored in 110 the given file as text. It is probably a good idea 111 to use an absolute path. 112 (defualt: not set) 113 114 -suffixmode N The smode parameter to the phrase extraction program. A 115 value of 0 means that stopwords are ignored, and of 1 116 means that stopwords are used. 117 (default: 1) 118 119 -thesaurus Name Name of a thesaurus stored in phind format in the 120 collection's etc directory. 121 (default: not set) 122 123 -untidy Don't remove working files. 
124 125 "; } 105 126 106 127 # Create a new phind browser based on collect.cfg … … 174 195 q^builddir/.*/^, \$builddir, 175 196 q^savephrases/\d/0^, \$self->{'savephrases'}, 176 q^suffixsize/\d+/100000^, \$self->{'suffixsize'},177 197 q^suffixmode/\d/1^, \$self->{'suffixmode'}, 178 198 q^thesaurus/.*/^, \$self->{'thesaurus'}, … … 250 270 print "process: $title\n" if ($verbosity > 2); 251 271 252 # only consider english-language files272 # Only consider the file if it is in the correct language 253 273 my $doclanguage = $doc_obj->get_metadata_element ($top_section, "Language"); 254 274 my $phrlanguage = $self->{'language_exp'}; … … 340 360 my $verbosity = $self->{'verbosity'}; 341 361 my $out = $self->{'outhandle'}; 342 343 362 my $phinddir = $self->{'phinddir'}; 344 my $language = "english"; 345 363 346 364 if ($verbosity) { 347 365 print $out "\n*** phind.pm generating indexes for ", $self->{'indexes'}, "\n"; … … 350 368 # Construct phind indexes 351 369 my $suffixmode = $self->{'suffixmode'}; 352 my $suffixsize = $self->{'suffixsize'};353 370 my ($command, $status); 354 371 … … 360 377 # Use the suffix program to generate the phind/phrases file 361 378 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity; 362 &execute("suffix $phinddir $suffix size $suffixmode", $verbosity, $out);379 &execute("suffix $phinddir $suffixmode $verbosity", $verbosity, $out); 363 380 364 381 # Create the phrase file and put phrase numbers in phind/phrases … … 552 569 my $out = $self->{'outhandle'}; 553 570 554 my $language = "english"; # $self->{'language'};555 556 571 my $collectiondir = $self->{'collectiondir'}; 557 558 572 my $phinddir = $self->{'phinddir'}; 573 574 my $language_exp = $self->{'language_exp'}; 559 575 560 576 my ($w, $l, $line, $word); … … 576 592 577 593 # Read and store the stopwords 578 my $words = `find $ENV{'GSDLHOME'}/etc/phind/$language -name "*.sw" | xargs cat`; 594 my $stopdir = &util::filename_cat($ENV{'GSDLHOME'}, "etc", 
"stopwords"); 595 my $stopword_files = (); 596 my ($language, $language_dir, $file, $file_name); 579 597 my %stopwords; 580 foreach $w (split(/\s+/, $words)) { 581 $l = lc($w); 582 $stopwords{$l} = $w; 583 } 584 598 599 # Examine each directory in the stopword directory 600 opendir(STOPDIR, $stopdir); 601 foreach $language (readdir STOPDIR) { 602 603 # Ignore entries that do not match the classifier's language 604 next unless ($language =~ /$language_exp/); 605 $language_dir = &util::filename_cat($stopdir, $language); 606 next unless (-d "$language_dir"); 607 608 opendir(LANGDIR, $language_dir); 609 foreach $file (readdir LANGDIR) { 610 611 # Ignore entries that are not stopword files 612 next unless ($file =~ /sw$/); 613 $file_name = &util::filename_cat($language_dir, $file); 614 next unless (-f "$file_name"); 615 616 # Read the stopwords 617 open(STOPFILE, "<$file_name"); 618 while (<STOPFILE>) { 619 s/^\s+//; 620 s/\s.*//; 621 $word = $_; 622 $l = lc($word); 623 $stopwords{$l} = $word; 624 } 625 close STOPFILE; 626 627 } 628 } 629 585 630 # Read thesaurus information 586 631 if ($thesaurus) {
Note:
See TracChangeset
for help on using the changeset viewer.