Changeset 1803
- Timestamp:
- 2000-12-18T14:18:19+13:00 (23 years ago)
- Location:
- trunk/gsdl
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/phind.pm
r1646 r1803 30 30 # 31 31 # options are: 32 # title=Title The title field for this classification 32 # button=Name The label for the classifiers button in the 33 # navigation bar (defaults to "Topic"). 34 # title=Title The metadata field used to describe each document 35 # (defaults to "Title"). 33 36 # text=fields The text used to build the phrase hierarchy 34 # phindexdir=directory Location of phind index files 37 # (defaults to "section:Title,section:text"). 38 # phinddir=directory Location of phind index files 35 39 # verbosity=num Control amount of output 36 40 # untidy=true Do not clean up intermediate files … … 41 45 # How a classifier works. 42 46 # 43 # When a classifier is requested in the collect.cfg file, buildcol creates a 44 # new classifier object (such as the one defined in theis file) and later 45 # passes each document object to the classifier in turn. Four functions are 46 # used: 47 # For each classifier requested in the collect.cfg file, buildcol.pl creates 48 # a new classifier object (such as the one defined in theis file) and later 49 # passes each document object to the classifier in turn for classification. 50 # 51 # Four functions are used: 47 52 # 48 53 # 1. "new" is called before the documents are processed to set up the … … 51 56 # 2. "init" is called after buildcol.pl has created the indexes etc but 52 57 # before the documents are classified in order that the classifier might 53 # set any vari oables it requiers, etc.58 # set any variables it requires, etc. 54 59 # 55 60 # 3. "classify" is called once for each document object. The classifier … … 136 141 exit(1); 137 142 } 138 139 143 140 144 # The installation appears OK - set up the classifier 141 145 my $collection = $ENV{'GSDLCOLLECTION'}; 142 my $phindexdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"phindex");143 146 my $language = "english"; 144 147 145 my $title = "Topic"; 148 my $button = "Phrase"; 149 my $title = "Title"; 146 150 my $indexes = "section:Title,section:text"; 151 152 my $builddir = ""; 153 my $phinddir = ""; 147 154 148 155 my $suffixmode = 1; … … 154 161 # parse the options 155 162 foreach $option (@options) { 163 164 print STDERR "option: $option\n"; 156 165 157 166 if ($option =~ /^text=(.*)$/i) { … … 159 168 } elsif ($option =~ /^title=(.*)$/i) { 160 169 $title = $1; 161 } elsif ($option =~ /^phindexdir=(.*)$/i) { 162 $phindexdir = $1; 170 } elsif ($option =~ /^button=(.*)$/i) { 171 $button = $1; 172 } elsif ($option =~ /^builddir=(.*)$/i) { 173 $builddir = $1; 174 } elsif ($option =~ /^phinddir=(.*)$/i) { 175 $phinddir = $1; 163 176 } elsif ($option =~ /^suffixsize=(.*)$/i) { 164 177 $suffixsize = $1; … … 172 185 } 173 186 174 187 # classifier information 175 188 $self->{'collection'} = $collection; 176 $self->{'title'} = $title; 189 $self->{'titlefield'} = $title; 190 $self->{'buttonname'} = $button; 177 191 $self->{'indexes'} = $indexes; 178 192 193 # phrase extraction options 179 194 $self->{'suffixmode'} = $suffixmode; 180 195 $self->{'suffixsize'} = $suffixsize; 181 182 $self->{'verbosity'} = $verbosity;183 $self->{'untidy'} = $untidy;184 196 185 197 # limit languages … … 188 200 $self->{'delimiter'} = $delimiter; 189 201 190 # reset phindex directory 191 if (-e "$phindexdir") { 192 &util::rm_r("$phindexdir"); 193 } 194 &util::mk_dir("$phindexdir"); 195 $self->{'phindexdir'} = $phindexdir; 202 # build directory 203 if (!$builddir) { 204 $builddir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building"); 205 } 206 $self->{'builddir'} = $builddir; 207 208 # phind directory 209 if (!$phinddir) { 210 $phinddir = &util::filename_cat($builddir, "phind"); 211 } 212 $self->{'phinddir'} = $phinddir; 213 214 # debugging levels 215 $self->{'verbosity'} = $verbosity; 216 $self->{'untidy'} = $untidy; 196 217 197 218 return bless $self, $class; … … 204 225 my $self = shift (@_); 205 226 227 # ensure we have a build directory 228 my $builddir = $self->{'builddir'}; 229 die unless (-e "$builddir"); 230 231 # create phind directory 232 my $phinddir = $self->{'phinddir'}; 233 if (-e "$phinddir") { 234 &util::rm_r("$phinddir"); 235 } 236 &util::mk_dir("$phinddir"); 237 206 238 # open filehandles for documents and text 207 my $phindexdir = $self->{'phindexdir'}; 208 209 my $clausefile = &util::filename_cat("$phindexdir", "clauses"); 239 my $clausefile = &util::filename_cat("$phinddir", "clauses"); 210 240 &util::rm($clausefile) if (-e $clausefile); 211 241 open(TEXT, ">$clausefile") || die "Cannot open $clausefile: $!"; 212 242 $self->{'txthandle'} = TEXT; 213 243 214 my $docfile = &util::filename_cat("$phind exdir", "docs.txt");244 my $docfile = &util::filename_cat("$phinddir", "docs.txt"); 215 245 &util::rm($docfile) if (-e $docfile); 216 246 open(DOCS, ">$docfile") || die "Cannot open $docfile: $!"; … … 232 262 my $top_section = $doc_obj->get_top_section(); 233 263 234 my $title = $doc_obj->get_metadata_element ($top_section, "Title"); 264 my $titlefield = $self->{'titlefield'}; 265 266 my $title = $doc_obj->get_metadata_element ($top_section, $titlefield); 235 267 print "process: $title\n" if ($verbosity > 2); 236 268 … … 316 348 # 317 349 # When get_classify_info is called, the clauses and docs.txt files have 318 # already been constructed in the phind exdirectory. This function will350 # already been constructed in the phind directory. This function will 319 351 # translate them into compressed, indexed MGPP files that can be read by 320 352 # the phindcgi script. It will also register our classifier so that it 321 # shows up in the navigation bar.353 # shows up in the navigation bar. 322 354 323 355 sub get_classify_info { … … 325 357 326 358 my $verbosity = $self->{'verbosity'}; 327 my $phind exdir = $self->{'phindexdir'};359 my $phinddir = $self->{'phinddir'}; 328 360 my $language = "english"; 329 361 … … 340 372 # from the clauses file 341 373 print "\nExtracting vocabulary and statistics\n" if $verbosity; 342 &extract_vocabulary($phind exdir, $language, $verbosity);343 344 # Use the suffix program to generate the phind ex/phrases file374 &extract_vocabulary($phinddir, $language, $verbosity); 375 376 # Use the suffix program to generate the phind/phrases file 345 377 print "\nExtracting phrases from processed text (with suffix)\n" if $verbosity; 346 &execute("suffix $phind exdir $suffixsize $suffixmode", $verbosity);347 348 # Create the phrase file and put phrase numbers in phind ex/phrases378 &execute("suffix $phinddir $suffixsize $suffixmode", $verbosity); 379 380 # Create the phrase file and put phrase numbers in phind/phrases 349 381 print "\nSorting and Renumbering phrases for input to mgpp\n" if $verbosity; 350 &renumber_phrases("$phind exdir", $verbosity);382 &renumber_phrases("$phinddir", $verbosity); 351 383 352 384 # Create the mg phrase database … … 361 393 362 394 print "\nCreating phrase databases\n"; 363 my $mg_input = &util::filename_cat($phind exdir, "pdata.txt");395 my $mg_input = &util::filename_cat($phinddir, "pdata.txt"); 364 396 my $mg_stem = "pdata"; 365 397 366 &execute("$mg_passes -d $phind exdir -f $mg_stem -T1 $mg_input", $verbosity);367 &execute("$mg_compression_dict -d $phind exdir -f $mg_stem", $verbosity);368 &execute("$mg_passes -d $phind exdir -f $mg_stem -T2 $mg_input", $verbosity);398 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity); 399 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity); 400 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity); 369 401 370 402 # create the mg index of words 371 403 print "\nCreating word-level search indexes\n"; 372 $mg_input = &util::filename_cat($phind exdir, "pword.txt");404 $mg_input = &util::filename_cat($phinddir, "pword.txt"); 373 405 $mg_stem = "pword"; 374 406 375 &execute("$mg_passes -d $phind exdir -f $mg_stem -T1 -I1 $mg_input", $verbosity);376 &execute("$mg_compression_dict -d $phind exdir -f $mg_stem", $verbosity);377 &execute("$mg_perf_hash_build -d $phind exdir -f $mg_stem", $verbosity);378 &execute("$mg_passes -d $phind exdir -f $mg_stem -T2 -I2 $mg_input", $verbosity);379 &execute("$mg_weights_build -d $phind exdir -f $mg_stem", $verbosity);380 &execute("$mg_invf_dict -d $phind exdir -f $mg_stem", $verbosity);381 382 &execute("$mg_stem_idx -d $phind exdir -f $mg_stem -s 1", $verbosity);383 &execute("$mg_stem_idx -d $phind exdir -f $mg_stem -s 2", $verbosity);384 &execute("$mg_stem_idx -d $phind exdir -f $mg_stem -s 3", $verbosity);407 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 -I1 $mg_input", $verbosity); 408 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity); 409 &execute("$mg_perf_hash_build -d $phinddir -f $mg_stem", $verbosity); 410 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 -I2 $mg_input", $verbosity); 411 &execute("$mg_weights_build -d $phinddir -f $mg_stem", $verbosity); 412 &execute("$mg_invf_dict -d $phinddir -f $mg_stem", $verbosity); 413 414 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 1", $verbosity); 415 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 2", $verbosity); 416 &execute("$mg_stem_idx -d $phinddir -f $mg_stem -s 3", $verbosity); 385 417 386 418 # create the mg document information database 387 419 print "\nCreating document information databases\n"; 388 $mg_input = &util::filename_cat($phind exdir, "docs.txt");420 $mg_input = &util::filename_cat($phinddir, "docs.txt"); 389 421 $mg_stem = "docs"; 390 422 391 &execute("$mg_passes -d $phind exdir -f $mg_stem -T1 $mg_input", $verbosity);392 &execute("$mg_compression_dict -d $phind exdir -f $mg_stem", $verbosity);393 &execute("$mg_passes -d $phind exdir -f $mg_stem -T2 $mg_input", $verbosity);423 &execute("$mg_passes -d $phinddir -f $mg_stem -T1 $mg_input", $verbosity); 424 &execute("$mg_compression_dict -d $phinddir -f $mg_stem", $verbosity); 425 &execute("$mg_passes -d $phinddir -f $mg_stem -T2 $mg_input", $verbosity); 394 426 395 427 … … 397 429 if (!$untidy) { 398 430 print "\nCleaning up\n" if ($verbosity > 2); 399 &util::rm("$phind exdir/clauses", "$phindexdir/clauses.numbers",400 "$phind exdir/clauses.vocab", "$phindexdir/clauses.stats",401 "$phind exdir/phrases", "$phindexdir/docs.txt",402 "$phind exdir/pdata.txt", "$phindexdir/pword.txt");431 &util::rm("$phinddir/clauses", "$phinddir/clauses.numbers", 432 "$phinddir/clauses.vocab", "$phinddir/clauses.stats", 433 "$phinddir/phrases", "$phinddir/docs.txt", 434 "$phinddir/pdata.txt", "$phinddir/pword.txt"); 403 435 my $outfile = 1; 404 while (-e "$phind exdir/outPhrase.$outfile") {405 &util::rm("$phind exdir/outPhrase.$outfile");436 while (-e "$phinddir/outPhrase.$outfile") { 437 &util::rm("$phinddir/outPhrase.$outfile"); 406 438 $outfile++; 407 439 } … … 415 447 my %classifyinfo = ('thistype'=>'Invisible', 416 448 'childtype'=>'Phind', 417 'Title'=>$self->{' title'},449 'Title'=>$self->{'buttonname'}, 418 450 'contains'=>[]); 419 451 … … 531 563 532 564 sub extract_vocabulary { 533 my ($phind ex_dir, $language, $verbosity) = @_;565 my ($phind_dir, $language, $verbosity) = @_; 534 566 535 567 my ($w, $l, $line, $word); … … 575 607 576 608 # Read words in the text and count occurences 577 open(TXT, "<$phind ex_dir/clauses");609 open(TXT, "<$phind_dir/clauses"); 578 610 my @words; 579 611 … … 684 716 685 717 # Outut the words 686 print "Saving vocabulary in $phind ex_dir/clauses.vocab\n" if ($verbosity > 1);687 open(VOC, ">$phind ex_dir/clauses.vocab");718 print "Saving vocabulary in $phind_dir/clauses.vocab\n" if ($verbosity > 1); 719 open(VOC, ">$phind_dir/clauses.vocab"); 688 720 689 721 for (my $i = 1; $i < $nextsymbol; $i++) { … … 697 729 698 730 # Output statistics about the vocablary 699 print "Saving statistics in $phind ex_dir/clauses.stats\n" if ($verbosity > 1);700 &util::rm("$phind ex_dir/clauses.stats") if (-e "$phindex_dir/clauses.stats");701 open(STAT, ">$phind ex_dir/clauses.stats")702 || die "Cannot open $phind ex_dir/clauses.stats: $!";731 print "Saving statistics in $phind_dir/clauses.stats\n" if ($verbosity > 1); 732 &util::rm("$phind_dir/clauses.stats") if (-e "$phind_dir/clauses.stats"); 733 open(STAT, ">$phind_dir/clauses.stats") 734 || die "Cannot open $phind_dir/clauses.stats: $!"; 703 735 704 736 print STAT "first_delimiter $first_delimiter\n"; … … 724 756 725 757 # Save text as symbol numbers 726 print "Saving text as numbers in $phind ex_dir/clauses.numbers\n" if ($verbosity > 1);727 728 open(TXT, "<$phind ex_dir/clauses");729 open(NUM, ">$phind ex_dir/clauses.numbers");758 print "Saving text as numbers in $phind_dir/clauses.numbers\n" if ($verbosity > 1); 759 760 open(TXT, "<$phind_dir/clauses"); 761 open(NUM, ">$phind_dir/clauses.numbers"); 730 762 731 763 $phrasedelimiter = $symbol{lc($senlimit)}; … … 763 795 764 796 sub renumber_phrases { 765 my ($phind ex_dir, $verbosity) = @_;797 my ($phind_dir, $verbosity) = @_; 766 798 767 799 # Sort the phrases into order of increasing frequency 768 800 # This means the expansions will be sorted correctly later on. 769 801 print "Sorting phrases into freq order\n" if ($verbosity); 770 system("sort -rnt ':' +2 -o $phind ex_dir/phrases $phindex_dir/phrases");802 system("sort -rnt ':' +2 -o $phind_dir/phrases $phind_dir/phrases"); 771 803 772 804 my @symbol; … … 774 806 # Read the vocabulary 775 807 print "Reading the vocabulary\n" if ($verbosity); 776 open(V, "<$phind ex_dir/clauses.vocab")777 || die "Cannot open $phind ex_dir/clauses.vocab: $!";808 open(V, "<$phind_dir/clauses.vocab") 809 || die "Cannot open $phind_dir/clauses.vocab: $!"; 778 810 779 811 my $i = 1; … … 799 831 my ($line); 800 832 801 open(IN, "<$phind ex_dir/phrases");833 open(IN, "<$phind_dir/phrases"); 802 834 while(<IN>) { 803 835 … … 818 850 print "Format phrase data for MGPP\n" if ($verbosity); 819 851 820 open(IN, "<$phind ex_dir/phrases");821 open(DATA, ">$phind ex_dir/pdata.txt");822 open(IDX, ">$phind ex_dir/pword.txt");852 open(IN, "<$phind_dir/phrases"); 853 open(DATA, ">$phind_dir/pdata.txt"); 854 open(IDX, ">$phind_dir/pword.txt"); 823 855 824 856 my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word); -
trunk/gsdl/perllib/mgbuilder.pm
r1799 r1803 139 139 140 140 # load all the classifiers 141 $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $ outhandle);141 $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle); 142 142 143 143 # load up any dontgdbm fields -
trunk/gsdl/src/phind/host/phindcgi.cpp
r1634 r1803 144 144 strcat(basepath, "/collect/"); 145 145 strcat(basepath, collection); 146 strcat(basepath, "/ phindex");146 strcat(basepath, "/index/phind"); 147 147 148 148 // If we don't know the phrase number, look itup … … 720 720 721 721 // mode 1 = casefolded, unstemmed search 722 QueryNode *queryTree = ParseQuery(query, 1 );722 QueryNode *queryTree = ParseQuery(query, 1, 1); 723 723 724 724 // cout << "-- query --" << endl;
Note:
See TracChangeset
for help on using the changeset viewer.