Changeset 1604 for trunk/gsdl/src/phind
- Timestamp: 2000-10-17T12:35:59+13:00 (24 years ago)
- Location: trunk/gsdl/src/phind/generate
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/src/phind/generate/phindgen.pl
r1591 r1604 79 79 80 80 my ($verbosity, $archivedir, $phindexdir, 81 $phind , $language, $maxdocs, $untidy,81 $phindcfg, $language, $maxdocs, $untidy, 82 82 $collection, $configfilename, $collectcfg); 83 83 … … 110 110 } 111 111 if (defined $collectcfg->{'phind'}) { 112 $phind = $collectcfg->{'phind'};112 $phindcfg = $collectcfg->{'phind'}; 113 113 } 114 114 } else { … … 125 125 126 126 # Make sure theuser has in fact requested phind indexes 127 if (!defined($phind )) {127 if (!defined($phindcfg)) { 128 128 print "No phind information in $configfilename\n"; 129 129 exit; … … 144 144 # Read the archives directory and build the clauses file 145 145 print "\nReading archive directory\n" if $verbosity; 146 &build_clauses($archivedir, $phindexdir, $language, $ verbosity, $maxdocs);146 &build_clauses($archivedir, $phindexdir, $language, $phindcfg, $verbosity, $maxdocs); 147 147 148 148 # Generate the vocabulary, symbol statistics, and numbers file … … 152 152 153 153 # Use the suffix program to generate the phindex/phrases file 154 $command = "suffix $phindexdir $symbol_limit $mode"; 155 print "\nExecuting: $command\n" if $verbosity; 156 $status = system($command); 157 if ($status != 0) { 158 print STDERR "phindgen.pl - Error executing $command: $!\n"; 159 exit($status); 160 } 154 &execute("suffix $phindexdir $symbol_limit $mode", $verbosity); 161 155 162 156 # Create the phrase file and put phrase numbers in phindex/phrases … … 168 162 my $mg_passes = &util::filename_cat($mgpp, "text", "mg_passes"); 169 163 my $mg_compression_dict = &util::filename_cat($mgpp, "text", "mg_compression_dict"); 170 my $mg_input = &util::filename_cat($phindexdir, "mg-p.txt"); 171 172 $command = "$mg_passes -d $phindexdir -f phrase -T1 $mg_input"; 173 print "\nExecuting: $command\n" if $verbosity; 174 $status = system($command); 175 if ($status != 0) { 176 print STDERR "phindgen.pl - Error executing $command: $!\n"; 177 exit($status); 178 } 179 180 $command = "$mg_compression_dict -d 
$phindexdir -f phrase"; 181 print "\nExecuting: $command\n" if $verbosity; 182 $status = system($command); 183 if ($status != 0) { 184 print STDERR "phindgen.pl - Error executing $command: $!\n"; 185 exit($status); 186 } 187 188 $command = "$mg_passes -d $phindexdir -f phrase -T2 $mg_input"; 189 print "\nExecuting: $command\n" if $verbosity; 190 $status = system($command); 191 if ($status != 0) { 192 print STDERR "phindgen.pl - Error executing $command: $!\n"; 193 exit($status); 194 } 164 165 my $mg_perf_hash_build = &util::filename_cat($mgpp, "text", "mg_perf_hash_build"); 166 my $mg_weights_build = &util::filename_cat($mgpp, "text", "mg_weights_build"); 167 my $mg_invf_dict = &util::filename_cat($mgpp, "text", "mg_invf_dict"); 168 my $mg_stem_idx = &util::filename_cat($mgpp, "text", "mg_stem_idx"); 169 170 print "\nCreating phrase databases\n"; 171 my $mg_input = &util::filename_cat($phindexdir, "pdata.txt"); 172 my $mg_stem = "pdata"; 173 174 &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity); 175 &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity); 176 &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity); 177 178 # create the mg index of words 179 print "\nCreating word-level search indexes\n"; 180 $mg_input = &util::filename_cat($phindexdir, "pword.txt"); 181 $mg_stem = "pword"; 182 183 &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 -I1 $mg_input", $verbosity); 184 &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity); 185 &execute("$mg_perf_hash_build -d $phindexdir -f $mg_stem", $verbosity); 186 &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 -I2 $mg_input", $verbosity); 187 &execute("$mg_weights_build -d $phindexdir -f $mg_stem", $verbosity); 188 &execute("$mg_invf_dict -d $phindexdir -f $mg_stem", $verbosity); 189 190 &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 1", $verbosity); 191 &execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 2", $verbosity); 192 
&execute("$mg_stem_idx -d $phindexdir -f $mg_stem -s 3", $verbosity); 193 194 # create the mg document information database 195 print "\nCreating document information databases\n"; 196 $mg_input = &util::filename_cat($phindexdir, "docs.txt"); 197 $mg_stem = "docs"; 198 199 &execute("$mg_passes -d $phindexdir -f $mg_stem -T1 $mg_input", $verbosity); 200 &execute("$mg_compression_dict -d $phindexdir -f $mg_stem", $verbosity); 201 &execute("$mg_passes -d $phindexdir -f $mg_stem -T2 $mg_input", $verbosity); 202 195 203 196 204 # Tidy up stray files … … 199 207 &util::rm("$phindexdir/clauses", "$phindexdir/clauses.numbers", 200 208 "$phindexdir/clauses.vocab", "$phindexdir/clauses.stats", 201 "$phindexdir/phrases", "$phindexdir/ mg-p.txt");209 "$phindexdir/phrases", "$phindexdir/docs.txt"); 202 210 my $outfile = 1; 203 211 while (-e "$phindexdir/outPhrase.$outfile") { … … 208 216 } 209 217 210 218 # Execute a system command 219 220 sub execute { 221 my ($command, $verbosity) = @_; 222 print "Executing: $command\n" if $verbosity; 223 my $status = system($command); 224 if ($status != 0) { 225 print STDERR "phindgen.pl - Error executing $command: $!\n"; 226 exit($status); 227 } 228 } 211 229 212 230 … … 218 236 219 237 sub build_clauses { 220 my ($archive_dir, $phindex_dir, $language, $ verbosity, $maxdocs) = @_;238 my ($archive_dir, $phindex_dir, $language, $phindcfg, $verbosity, $maxdocs) = @_; 221 239 222 240 # create a "pluginfo" for ArcPlug and RecPlug … … 227 245 228 246 # create a phind document processor object to process the documents 229 my $processor = new phproc ($archive_dir, $phindex_dir, $ language,247 my $processor = new phproc ($archive_dir, $phindex_dir, $phindcfg, $language, 230 248 $doclimit, $verbosity, "STDOUT"); 231 249 … … 476 494 477 495 # Sort the phrases into order of increasing frequency 496 # This means the expansions will be sorted correctly later on. 
478 497 print "Sorting phrases into freq order\n" if ($verbosity); 479 498 system("sort -rnt ':' +2 -o $phindex_dir/phrases $phindex_dir/phrases"); … … 495 514 # 496 515 # The phrases file looks something like this 497 # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254 498 # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380 499 # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374 500 # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361 501 502 516 # 159396-1:s5175:4:1:116149-2:3:d2240,2;d2253;d2254 517 # 159409-1:s5263:6:1:159410-2:6:d2122;d2128;d2129;d2130;d2215;d2380 518 # 159415-1:s5267:9:1:159418-2:8:d3,2;d632;d633;d668;d1934;d2010;d2281;d2374 519 # 159426-1:s5273:5:2:159429-2,115168-17:5:d252;d815;d938;d939;d2361 520 521 # The first field on each line is a unique phrase identifier. 522 # We need to calculate phrase numbers for each phrase 503 523 print "Calculate phrase numbers\n" if ($verbosity); 504 524 505 open(IN, "<$phindex_dir/phrases");506 507 525 my %phrasenumber; 508 526 my $nextphrase = 1; 509 510 my ($line, $num);511 527 my ($line); 528 529 open(IN, "<$phindex_dir/phrases"); 512 530 while(<IN>) { 513 531 … … 516 534 $line = $_; 517 535 518 # we're only interested in the rfirst field536 # we're only interested in the first field 519 537 $line =~ s/:.*//; 520 538 521 539 # get a phrase number for this line 522 $num = $nextphrase; 523 $phrasenumber{$line} = $num; 540 $phrasenumber{$line} = $nextphrase; 524 541 $nextphrase++; 525 542 } 526 543 527 # Extract the phrase data 528 print "Create phrase file and frequency file\n" if ($verbosity); 544 545 # Now we create a new phrase file using phrase numbers, not the old IDs. 
546 print "Format phrase data for MGPP\n" if ($verbosity); 529 547 530 548 open(IN, "<$phindex_dir/phrases"); 531 open(DATA, ">$phindex_dir/mg-p.txt"); 532 533 my ($key, $tf, $countexp, $expansions, $countdocs, $documents, $text); 549 open(DATA, ">$phindex_dir/pdata.txt"); 550 open(IDX, ">$phindex_dir/pword.txt"); 551 552 my ($key, $tf, $num, $countexp, $expansions, $countdocs, $documents, $text, $word); 534 553 my @fields; 535 554 my @documents; … … 545 564 @fields = split(/:/, $line); 546 565 547 # output the MG document tag548 print DATA "<Document>";549 550 566 # get a phrase number for this line 551 567 $key = shift @fields; 552 568 die unless (defined($phrasenumber{$key})); 553 569 $num = $phrasenumber{$key}; 554 print DATA "$num:";555 570 556 571 # get the text of the phrase 557 572 $text = shift @fields; 558 573 $text =~ s/s(\d+)/$symbol[$1]/g; 559 print DATA "$text:"; 560 574 if ($text =~ / /) { 575 $word = ""; 576 } else { 577 $word = $text; 578 } 579 561 580 $linenumber++; 562 581 if ($linenumber % 1000 == 0) { … … 567 586 # get the phrase frequency 568 587 $tf = shift @fields; 569 print DATA "$tf:";570 588 571 589 # get the number of expansions 572 590 $countexp = shift @fields; 573 print DATA "$countexp:";574 591 575 592 # get the expansions and convert them into phrase numbers … … 581 598 push @newexp, $n; 582 599 } 583 print DATA join(",", (sort numerically @newexp)), ":";584 600 @newexp = sort numerically @newexp; 601 585 602 # get the number of documents 586 603 $countdocs = shift @fields; 587 print DATA "$countdocs:";588 604 589 605 # get the documents … … 591 607 $documents =~ s/d//g; 592 608 @documents = split(/;/, $documents); 593 594 print DATA join(";", (sort by_frequency @documents)); 595 596 print DATA "\n"; 609 @documents = sort by_frequency @documents; 610 611 # output the phrase data 612 print DATA "<Document>"; 613 print DATA "$num:$text:$tf:$countexp:$countdocs:"; 614 print DATA join(",", @newexp), ":", join(";", @documents), "\n"; 615 
616 # output the word index search data 617 print IDX "<Document>$word\n"; 618 597 619 598 620 } -
trunk/gsdl/src/phind/generate/phproc.pm
r1562 r1604 38 38 39 39 sub new { 40 my ($class, $archive_dir, $phindex_dir, 40 my ($class, $archive_dir, $phindex_dir, $phindcfg, 41 41 $language, $delimiter, $verbosity, $outhandle) = @_; 42 42 43 43 my $self = new docproc (); 44 44 45 # $self->{'collection'} = $collection;46 45 $self->{'archive_dir'} = $archive_dir; 47 46 $self->{'phindex_dir'} = $phindex_dir; 47 $self->{'indexes'} = $phindcfg; 48 48 49 49 $language =~ s/,/\|/g; … … 60 60 $self->{'txthandle'} = TEXT; 61 61 62 &util::rm("$phindex_dir/mg-d.txt") if (-e "$phindex_dir/mg-d.txt"); 63 open(DOCS, ">$phindex_dir/mg-d.txt") 64 || die "Cannot open $phindex_dir/mg-d.txt: $!"; 62 my $docfile = &util::filename_cat("$phindex_dir", "docs.txt"); 63 &util::rm($docfile) if (-e $docfile); 64 open(DOCS, ">$docfile") 65 || die "Cannot open $docfile: $!"; 65 66 $self->{'dochandle'} = DOCS; 66 67 … … 95 96 my $dochandle = $self->{'dochandle'}; 96 97 # print "dochandle: =$dochandle=\n"; 97 print $dochandle "$OID\t$title\n"; 98 99 # store the text 100 $text = convert_gml_to_tokens($doc_obj->get_text()); 101 98 print $dochandle "<Document>\t$OID\t$title\n"; 99 100 # XXX 101 # Store the text of this object 102 my $indexlist = $self->{'indexes'}; 103 my @parts; 104 my ($index, $part, $level, $field, $section, $data, $text); 105 106 # Output the document delimiter 102 107 my $txthandle = $self->{'txthandle'}; 103 print $txthandle $self->{'delimiter'}, "\n$text\n"; 108 print $txthandle $self->{'delimiter'}, "\n"; 109 110 # Iterarate over all the indexes specified in collect.cfg and 111 # add their text to the clauses file. 112 foreach $index (@$indexlist) { 113 $text = ""; 114 115 # Iterate over all the feilds in each index 116 @parts = split(/,/, $index); 117 foreach $part (@parts) { 118 119 # Each field has a level and a data element ((e.g. 
document:Title) 120 ($level, $field) = split(/:/, $part); 121 die unless ($level && $field); 122 123 # Extract the text from every section 124 # (In phind, document:text and section:text are equivalent) 125 if ($field eq "text") { 126 $data = ""; 127 $section = $doc_obj->get_top_section(); 128 while (defined($section)) { 129 $data .= $doc_obj->get_text($section) . "\n"; 130 $section = $doc_obj->get_next_section($section); 131 } 132 $text .= convert_gml_to_tokens($data) . "\n"; 133 } 134 135 # Extract a metadata field from a document 136 elsif ($level eq "document") { 137 $data = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field); 138 $text .= convert_gml_to_tokens($data) . "\n"; 139 } 140 141 # Extract metadata from every section in a document 142 elsif ($level eq "section") { 143 $data = ""; 144 $section = $doc_obj->get_top_section(); 145 while (defined($section)) { 146 $data .= $doc_obj->get_metadata_element($section, $field) . "\n"; 147 $section = $doc_obj->get_next_section($section); 148 } 149 $text .= convert_gml_to_tokens($data) . "\n"; 150 } 151 152 # Some sort of specification which I don't understand 153 else { 154 die "Unknown level ($level) in phind key ($part) in phind index ($index)\n"; 155 } 156 157 } 158 159 # print the text 160 print $txthandle "$text"; 161 162 } 104 163 } 105 164
Note: See TracChangeset for help on using the changeset viewer.