Changeset 1852
- Timestamp:
- 2001-01-22T15:30:56+13:00 (23 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/perllib/mgppbuilder.pm
r1772 r1852 1 1 ########################################################################### 2 2 # 3 # mg builder.pm -- MGBuilder object3 # mgppbuilder.pm -- MGBuilder object 4 4 # A component of the Greenstone digital library software 5 5 # from the New Zealand Digital Library Project at the … … 48 48 $maxdocsize = 12000; 49 49 50 #update this !!!!!!!!!!!!!!!! 50 51 51 %wanted_index_files = ('td'=>1, 52 52 't'=>1, 53 'tl'=>1, 54 'ti'=>1, 53 55 'idb'=>1, 54 56 'ib1'=>1, … … 56 58 'ib3'=>1, 57 59 'i'=>1, 58 'ip'=>1, 59 'tiw'=>1, 60 'il'=>1, 61 'tw'=>1, 62 'w'=>1, 60 63 'wa'=>1); 61 64 65 # change this so a user can add their own ones in via a file or cfg 66 %static_indexfield_map = ('Title'=>'TI', 67 'TI'=>1, 68 'Subject'=>'SU', 69 'SU'=>1, 70 'Creator'=>'CR', 71 'CR'=>1, 72 'Organization'=>'OR', 73 'OR'=>1, 74 'Source'=>'SO', 75 'SO'=>1, 76 'Howto'=>'HT', 77 'HT'=>1, 78 'ItemTitle'=>'IT', 79 'IT'=>1, 80 'ProgNumber'=>'PN', 81 'PN'=>1, 82 'People'=>'PE', 83 'PE'=>1, 84 'TextOnly'=>'TX', 85 'TX'=>1); 62 86 63 87 sub new { … … 77 101 'allclassifications'=>$allclassifications, 78 102 'outhandle'=>$outhandle, 79 'notbuilt'=>[] # indexes not built 80 }, $class; 81 103 'notbuilt'=>[], # indexes not built 104 'indexfieldmap'=>\%static_indexfield_map 105 }, $class; 106 82 107 83 108 # read in the collection configuration file … … 110 135 } 111 136 137 # get the levels (Section, Paragraph) for indexing and compression 138 $self->{'levels'} = {}; 139 if (defined $self->{'collect_cfg'}->{'levels'}) { 140 foreach $level ( @{$self->{'collect_cfg'}->{'levels'}} ){ 141 $self->{'levels'}->{$level} = 1; 142 } 143 } 144 112 145 # get the list of plugins for this collection 113 146 my $plugins = []; … … 142 175 # load up the document processor for building 143 176 # if a buildproc class has been created for this collection, use it 144 # otherwise, use the mg buildproc177 # otherwise, use the mgpp buildproc 145 178 my ($buildprocdir, $buildproctype); 146 179 if (-e 
"$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") { … … 175 208 } 176 209 177 sub build_collection { 178 my $self = shift (@_); 179 my ($textindex, $indexname) = @_; 180 181 my $outhandle = $self->{'outhandle'}; 182 183 print $outhandle "build_col, textindex=$textindex, indexname=$indexname\n"; 210 sub set_strip_html { 211 my $self = shift (@_); 212 my ($strip) = @_; 213 214 $self->{'strip_html'} = $strip; 215 $self->{'buildproc'}->set_strip_html($strip); 216 } 217 218 sub compress_text { 219 220 my $self = shift (@_); 221 my ($textindex) = @_; 222 184 223 my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text"; 185 224 my $exe = &util::get_os_exe (); 186 187 225 my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe"); 188 226 my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe"); 189 my $mg_perf_hash_build_exe = 190 &util::filename_cat($exedir, "mg_perf_hash_build$exe"); 191 my $mg_weights_build_exe = 192 &util::filename_cat ($exedir, "mg_weights_build$exe"); 193 my $mg_invf_dict_exe = 194 &util::filename_cat ($exedir, "mg_invf_dict$exe"); 195 my $mg_stem_idx_exe = 196 &util::filename_cat ($exedir, "mg_stem_idx$exe"); 227 my $outhandle = $self->{'outhandle'}; 197 228 198 229 &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text")); 199 my $basefilename = "$self->{'collection'}"; 200 # my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename); 201 # my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, 202 # $self->{'collection'}); 203 204 my $fulltextprefix=$self->{'build_dir'}; # note if this works, change all to $directory, change in mg calls!!!!!!!!!!!!!! 205 my $fullindexprefix=$self->{'build_dir'}; 206 207 my $directory = $self->{'build_dir'}; 208 my $osextra = ""; 209 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 210 $fulltextprefix =~ s/\//\\/g; 211 #$directory = ~s/\//\\/g; 212 } else { 213 $osextra = " -d /"; 214 } 215 216 #indexname got from command line arg. 
if not specified, its "", so use 217 # ones stated in cfg file 218 my $indexes = []; 219 if (!(defined $indexname && $indexname =~ /\w/)) { 220 $indexes = $self->{'collect_cfg'}->{'indexes'}; 221 $indexname="Title,Organization,Magazine,text"; 222 } 223 else { 224 push @$indexes, $indexname; 225 } 226 print $outhandle "indexes are: @$indexes\n"; 227 228 229 print $outhandle "\n*** mg_passes: first pass\n" if ($self->{'verbosity'} >= 1); 230 print $outhandle "fulltextprefix=$fulltextprefix\n"; 231 # carry out the first pass of mg_passes 232 # -b $maxdocsize sets the maximum document size to be 12 meg - not available any longer 233 print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1); 230 231 my $builddir = $self->{'build_dir'}; 232 my $basefilename = "text/$self->{'collection'}"; 233 234 # mgpp cant work on windows at the moment 235 # if ($ENV{'GSDLOS'} =~ /^windows$/i) { 236 # $basefilename =~ s/\//\\/g; 237 # $builddir =~ s/\//\\/g; 238 # 239 # } 240 241 242 # define the section names for mgpasses 243 # the compressor doesn't need to know about paragraphs - never want to 244 # retrieve them 245 my $mg_passes_sections = ""; 246 if ($self->{'levels'}->{'Section'}) { 247 $mg_passes_sections .= "-K Section "; 248 } 249 250 print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1); 251 252 # collect the statistics for the text 253 # -b $maxdocsize sets the maximum document size to be 12 meg 254 print $outhandle "\n collecting text statistics (mg_passes -T1)\n" if ($self->{'verbosity'} >= 1); 234 255 235 256 my ($handle); … … 238 259 } else { 239 260 if (!-e "$mg_passes_exe" || 240 !open (PIPEOUT, "| $mg_passes_exe -K Section -T1 -I1 -d $fulltextprefix -f $basefilename")) {261 !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -T1")) { 241 262 die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n"; 242 263 } … … 244 265 } 245 266 246 247 #Assume that only going to build 
one index for now. so index will be248 # anything specified in cfg file249 267 $self->{'buildproc'}->set_output_handle ($handle); 250 268 $self->{'buildproc'}->set_mode ('text'); 251 $self->{'buildproc'}->set_index ($indexname); 252 $self->{'buildproc'}->set_indexing_text (1); # not used at the moment I think 269 $self->{'buildproc'}->set_index ($textindex); 270 $self->{'buildproc'}->set_indexing_text (0); 271 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 272 $self->{'buildproc'}->set_levels ($self->{'levels'}); 253 273 $self->{'buildproc'}->reset(); 254 274 &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'}, … … 261 281 close ($handle) unless $self->{'debug'}; 262 282 263 264 # create the compression dictionary265 # the compression dictionary is built by assuming the stats are from a seed266 # dictionary (-S), if a novel word is encountered it is spelled out (-H),267 # and the resulting dictionary must be less than 5 meg with the most frequent268 # words being put into the dictionary first (-2 -k 5120)269 if (!$self->{'debug'}) {270 print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);271 if (!-e "$mg_compression_dict_exe") {272 die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";273 }274 system ("$mg_compression_dict_exe -d $fulltextprefix -f $basefilename");275 276 # create the perfect hash function277 if (!-e "$mg_perf_hash_build_exe") {278 die "mgppbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";279 }280 system ("$mg_perf_hash_build_exe -d $fullindexprefix -f $basefilename");281 282 # compress the text283 # -b $maxdocsize sets the maximum document size to be 12 meg284 if (!$self->{'debug'}) {285 if (!-e "$mg_passes_exe" ||286 !open ($handle, "| $mg_passes_exe -K Section -d $fulltextprefix -f $basefilename -T2 -I2")) {287 die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";288 }289 }290 }291 292 $self->{'buildproc'}->reset();293 294 print 
$outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);295 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},296 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});297 close ($handle) unless $self->{'debug'};298 299 300 301 # create the weights file302 print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);303 if (!-e "$mg_weights_build_exe") {304 die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";305 }306 system ("$mg_weights_build_exe -d $fullindexprefix -f $basefilename ");307 308 # create 'on-disk' stemmed dictionary309 print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);310 if (!-e "$mg_invf_dict_exe") {311 die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";312 }313 system ("$mg_invf_dict_exe -d $fullindexprefix -f $basefilename");314 315 316 # creates stem index files for the various stemming methods317 print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);318 if (!-e "$mg_stem_idx_exe") {319 die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";320 }321 system ("$mg_stem_idx_exe -b 4096 -s1 -d $fullindexprefix -f $basefilename");322 system ("$mg_stem_idx_exe -b 4096 -s2 -d $fullindexprefix -f $basefilename");323 system ("$mg_stem_idx_exe -b 4096 -s3 -d $fullindexprefix -f $basefilename");324 }325 326 327 #for mgpp with more than one index328 sub compress_text {329 330 my $self = shift (@_);331 my ($textindex) = @_;332 333 # $textindex = "Title,Organization,Subject,Magazine,text";334 my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";335 my $exe = &util::get_os_exe ();336 my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");337 my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");338 my $outhandle = $self->{'outhandle'};339 340 &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));341 342 my $builddir = $self->{'build_dir'};343 my $basefilename = 
"text/$self->{'collection'}";344 345 if ($ENV{'GSDLOS'} =~ /^windows$/i) {346 $basefilename =~ s/\//\\/g;347 $builddir =~ s/\//\\/g;348 349 }350 351 print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);352 353 # collect the statistics for the text354 # -b $maxdocsize sets the maximum document size to be 12 meg355 print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);356 357 my ($handle);358 if ($self->{'debug'}) {359 $handle = STDOUT;360 } else {361 if (!-e "$mg_passes_exe" ||362 !open (PIPEOUT, "| $mg_passes_exe -K Section -d $builddir -f $basefilename -T1")) {363 die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";364 }365 $handle = mgppbuilder::PIPEOUT;366 }367 368 $self->{'buildproc'}->set_output_handle ($handle);369 $self->{'buildproc'}->set_mode ('text');370 $self->{'buildproc'}->set_index ($textindex);371 $self->{'buildproc'}->set_indexing_text (0);372 $self->{'buildproc'}->reset();373 &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},374 $self->{'buildproc'}, $self->{'maxdocs'});375 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},376 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});377 &plugin::end($self->{'pluginfo'});378 close (PIPEOUT);379 380 close ($handle) unless $self->{'debug'};381 382 283 # create the compression dictionary 383 284 # the compression dictionary is built by assuming the stats are from a seed … … 385 286 # and the resulting dictionary must be less than 5 meg with the most 386 287 # frequent words being put into the dictionary first (-2 -k 5120) 387 # note: th isoptions are left over from mg version288 # note: these options are left over from mg version 388 289 if (!$self->{'debug'}) { 389 290 print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1); … … 396 297 if (!$self->{'debug'}) { 397 298 if (!-e "$mg_passes_exe" || 398 !open ($handle, "| $mg_passes_exe -K Section-f $basefilename -d $builddir -T2")) {299 
!open ($handle, "| $mg_passes_exe $mg_passes_compress_sections -f $basefilename -d $builddir -T2")) { 399 300 die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n"; 400 301 } … … 404 305 $self->{'buildproc'}->reset(); 405 306 # compress the text 406 print $outhandle "\n compressing the text \n" if ($self->{'verbosity'} >= 1);307 print $outhandle "\n compressing the text (mg_passes -T2)\n" if ($self->{'verbosity'} >= 1); 407 308 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 408 309 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); … … 439 340 $indexes = $self->{'collect_cfg'}->{'indexes'}; 440 341 } 441 442 # push @$indexes, "text,Title,Organization,Magazine,Subject";443 # push @$indexes, "Title,Organization,Magazine,Subject";444 342 445 343 # create the mapping between the index descriptions … … 593 491 my $exe = &util::get_os_exe (); 594 492 my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe"); 493 494 # define the section names for mgpasses 495 my $mg_passes_sections = ""; 496 foreach $level (keys (%{$self->{'levels'}})) { 497 if ($level eq "Section" || $level eq "Paragraph") { 498 $mg_passes_sections .= "-K $level "; 499 } 500 } 501 595 502 my $mg_perf_hash_build_exe = 596 503 &util::filename_cat($exedir, "mg_perf_hash_build$exe"); … … 602 509 &util::filename_cat ($exedir, "mg_stem_idx$exe"); 603 510 604 if ($ENV{'GSDLOS'} =~ /^windows$/i) {605 $builddir=~ s/\//\\/g;606 $basefilename =~ s/\//\\/g;607 }511 # if ($ENV{'GSDLOS'} =~ /^windows$/i) { 512 # $builddir=~ s/\//\\/g; 513 # $basefilename =~ s/\//\\/g; 514 # } 608 515 609 516 # get the index expression if this index belongs … … 631 538 632 539 # Build index dictionary. 
Uses verbatim stem method 633 print $outhandle "\n creating index dictionary \n" if ($self->{'verbosity'} >= 1);540 print $outhandle "\n creating index dictionary (mg_passes -I1)\n" if ($self->{'verbosity'} >= 1); 634 541 my ($handle); 635 542 if ($self->{'debug'}) { … … 637 544 } else { 638 545 if (!-e "$mg_passes_exe" || 639 !open (PIPEOUT, "| $mg_passes_exe -K Section-d $builddir -f $basefilename -I1")) {546 !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -I1")) { 640 547 die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n"; 641 548 } … … 648 555 $self->{'buildproc'}->set_index ($index, $indexexparr); 649 556 $self->{'buildproc'}->set_indexing_text (1); 650 557 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 558 $self->{'buildproc'}->set_levels ($self->{'levels'}); 651 559 $self->{'buildproc'}->reset(); 652 560 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, … … 664 572 665 573 if (!-e "$mg_passes_exe" || 666 !open ($handle, "| $mg_passes_exe -K Section-d $builddir -f $basefilename -I2")) {574 !open ($handle, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -I2")) { 667 575 die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n"; 668 576 } … … 670 578 671 579 # invert the text 672 print $outhandle "\n inverting the text \n" if ($self->{'verbosity'} >= 1);580 print $outhandle "\n inverting the text (mg_passes -I2)\n" if ($self->{'verbosity'} >= 1); 673 581 674 582 $self->{'buildproc'}->reset(); … … 708 616 709 617 # remove unwanted files 710 #my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);711 #opendir (DIR, $tmpdir) || die712 #"mgppbuilder::build_index - couldn't read directory $tmpdir\n";713 #foreach $file (readdir(DIR)) {714 #next if $file =~ /^\./;715 #my ($suffix) = $file =~ /\.([^\.]+)$/;716 #if (defined $suffix && !defined $wanted_index_files{$suffix}) {618 my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir); 619 opendir 
(DIR, $tmpdir) || die 620 "mgppbuilder::build_index - couldn't read directory $tmpdir\n"; 621 foreach $file (readdir(DIR)) { 622 next if $file =~ /^\./; 623 my ($suffix) = $file =~ /\.([^\.]+)$/; 624 if (defined $suffix && !defined $wanted_index_files{$suffix}) { 717 625 # delete it! 718 #print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;719 #&util::rm (&util::filename_cat ($tmpdir, $file));720 #}721 #}722 #closedir (DIR);626 print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2; 627 &util::rm (&util::filename_cat ($tmpdir, $file)); 628 } 629 } 630 closedir (DIR); 723 631 } 724 632 } … … 767 675 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'}); 768 676 $self->{'buildproc'}->set_indexing_text (0); 677 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 678 769 679 $self->{'buildproc'}->reset(); 770 680 … … 791 701 } 792 702 } 703 #print out the indexfield mapping 704 foreach $field (keys(%{$self->{'indexfieldmap'}})) { 705 $shortname = $self->{'indexfieldmap'}->{$field}; 706 print $handle "<$shortname>$field\n"; 707 } 793 708 print $handle "\n" . ('-' x 70) . "\n"; 794 709 … … 851 766 $build_cfg->{'notbuilt'} = $self->{'notbuilt'}; 852 767 768 # store the indexfieldmap information 769 my @indexfieldmap = (); 770 #add all fields bit 771 foreach $field (keys %{$self->{'buildproc'}->{'indexfields'}}) { 772 push (@indexfieldmap, "$field\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$field}"); 773 } 774 775 $build_cfg->{'indexfieldmap'} = \@indexfieldmap; 853 776 854 777 #store the indexed field information … … 860 783 &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg, 861 784 '^(builddate|buildtype|numdocs|numbytes)$', 862 '^(indexmap|subcollectionmap|languagemap| notbuilt|indexfields)$');785 '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$'); 863 786 864 787 } -
trunk/gsdl/perllib/mgppbuildproc.pm
r1772 r1852 1 1 ########################################################################### 2 2 # 3 # mg buildproc.pm --3 # mgppbuildproc.pm -- 4 4 # A component of the Greenstone digital library software 5 5 # from the New Zealand Digital Library Project at the … … 25 25 26 26 # This document processor outputs a document 27 # for mg to process27 # for mgpp to process 28 28 29 29 … … 67 67 $self->{'num_processed_bytes'} = 0; 68 68 $self->{'outhandle'} = $outhandle; 69 $self->{'dontindex'} = {}; 70 $self->{'indexfieldmap'} = {}; 69 71 70 72 $self->{'indexing_text'} = 0; 71 73 $self->{'indexfields'} = {}; 74 $self->{'strip_html'}=1; 75 72 76 73 77 return bless $self, $class; … … 171 175 172 176 return $self->{'indexing_text'}; 177 } 178 179 sub set_indexfieldmap { 180 my $self = shift (@_); 181 my ($indexmap) = @_; 182 183 $self->{'indexfieldmap'} = $indexmap; 184 } 185 186 sub get_indexfieldmap { 187 my $self = shift (@_); 188 189 return $self->{'indexfieldmap'}; 190 } 191 192 sub set_levels { 193 my $self = shift (@_); 194 my ($levels) = @_; 195 196 $self->{'levels'} = $levels; 197 } 198 199 sub set_strip_html { 200 my $self = shift (@_); 201 my ($strip) = @_; 202 $self->{'strip_html'}=$strip; 173 203 } 174 204 … … 238 268 my ($doc_obj, $filename) = @_; 239 269 my $handle = $self->{'output_handle'}; 240 # $handle = "main::STDOUT";241 270 242 271 my $doctype = $doc_obj->get_doc_type(); … … 244 273 # only output this document if it is one to be indexed 245 274 return if ($doctype ne "indexed_doc"); 275 276 #if a Section level index is not built, the gdbm file should be at doc 277 #level not Section 278 my $docs_only = 1; 279 if ($self->{'levels'}->{'Section'}) { 280 $docs_only = 0; 281 } 246 282 247 283 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/; … … 287 323 288 324 # output all the section metadata 289 #my $found_doctype = 0;290 325 my $metadata = $doc_obj->get_all_metadata ($section); 291 326 foreach $pair (@$metadata) { 292 327 my ($field, $value) 
= (@$pair); 293 328 294 #$found_doctype = 1 if $field eq "doctype";295 329 if ($field ne "Identifier" && $field !~ /^gsdl/ && 296 330 defined $value && $value ne "") { … … 315 349 } 316 350 317 # output the fact that this document is a document318 # (unless doctype was already output as part of319 # metadata)320 #if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {321 # print $handle "<doctype>doc\n";322 #}323 324 325 326 351 # output archivedir if at top level 327 352 if ($section eq $doc_obj->get_top_section()) { … … 334 359 } 335 360 336 # output a list of children 337 my $children = $doc_obj->get_children ($section); 338 if (scalar(@$children) > 0) { 339 print $handle "<childtype>$childtype\n"; 340 print $handle "<contains>"; 341 my $firstchild = 1; 342 foreach $child (@$children) { 343 print $handle ";" unless $firstchild; 344 $firstchild = 0; 345 if ($child =~ /^.*?\.(\d+)$/) { 346 print $handle "\".$1"; 347 } else { 348 print $handle "\".$child"; 349 } 361 if (!$docs_only) { 362 # output a list of children 363 my $children = $doc_obj->get_children ($section); 364 if (scalar(@$children) > 0) { 365 print $handle "<childtype>$childtype\n"; 366 print $handle "<contains>"; 367 my $firstchild = 1; 368 foreach $child (@$children) { 369 print $handle ";" unless $firstchild; 370 $firstchild = 0; 371 if ($child =~ /^.*?\.(\d+)$/) { 372 print $handle "\".$1"; 373 } else { 374 print $handle "\".$child"; 375 } 350 376 # if ($child eq "") { print $handle "$doc_OID"; } 351 377 # elsif ($section eq "") { print $handle "$doc_OID.$child"; } 352 378 # else { print $handle "$doc_OID.$section.$child"; } 353 } 354 print $handle "\n"; 355 } 356 357 # output the matching document number 358 print $handle "<docnum>$self->{'num_sections'}\n"; 359 379 } 380 print $handle "\n"; 381 } 382 #output the matching doc number 383 print $handle "<docnum>$self->{'num_sections'}\n"; 384 385 } # if (!$docs_only) 386 else { #docs only, doc num is num_docs not num_sections 387 # 
output the matching document number 388 print $handle "<docnum>$self->{'num_docs'}\n"; 389 } 390 360 391 print $handle '-' x 70, "\n"; 361 392 362 393 363 394 # output a database entry for the document number 364 print $handle "[$self->{'num_sections'}]\n"; 365 if ($section eq "") { print $handle "<section>$doc_OID\n"; } 366 else { print $handle "<section>$doc_OID.$section\n"; } 395 if ($docs_only) { 396 print $handle "[$self->{'num_docs'}]\n"; 397 print $handle "<section>$doc_OID\n"; 398 } 399 else { 400 print $handle "[$self->{'num_sections'}]\n"; 401 if ($section eq "") { print $handle "<section>$doc_OID\n"; } 402 else { print $handle "<section>$doc_OID.$section\n"; } 403 } 367 404 print $handle '-' x 70, "\n"; 368 405 … … 374 411 $first = 0; 375 412 $section = $doc_obj->get_next_section($section); 413 last if ($docs_only); # if no sections wanted, only gdbm the docs 376 414 } 377 415 … … 384 422 $_[1] =~ s/(<p\b)/<Paragraph>$1/gi; 385 423 } 424 425 #this function strips the html tags from the doc if ($strip_html) and 426 # if ($para) replaces <p> with <Paragraph> tags. 427 # if both are false, the original text is returned 428 #assumes that <pre> and </pre> have no spaces, and removes all < and > inside 429 #these tags 430 sub preprocess_text { 431 my $self = shift (@_); 432 my ($text, $strip_html, $para) = @_; 433 434 my ($outtext) = ""; 435 if ($strip_html) { 436 while ($text =~ /<([^>]*)>/ && $text ne "") { 437 438 $tag = $1; 439 $outtext .= $`." "; #add everything before the matched tag 440 $text = $'; #everything after the matched tag 441 if ($para && $tag =~ /^\s*p\s/) { 442 $outtext .= "<Paragraph> "; 443 } 444 elsif ($tag =~ /^pre$/) { # a pre tag 445 $text =~ /<\/pre>/; # find the closing pre tag 446 my $tmp_text = $`; #everything before the closing pre tag 447 $text = $'; #everything after the </pre> 448 $tmp_text =~ s/[<>]//g; # remove all < and > 449 $outtext.= $tmp_text . 
" "; 450 } 451 } 452 453 $outtext .= $text; # add any remaining text 454 return $outtext; 455 } #if strip_html 456 457 if ($para) { 458 $text =~ s/(<p\b)/<Paragraph>$1/gi; 459 return $text; 460 } 461 return $text; 462 } 463 464 386 465 387 466 sub filter_text { … … 436 515 # get the parameters for the output 437 516 my ($fields) = $self->{'index'}; 438 #print STDERR "fields are $fields\n"; 439 $fields =~ s/\ball\b/Title,Creator,text/; # add in others here 440 517 518 my ($sectiontag) = ""; 519 if ($self->{'levels'}->{'Section'}) { 520 $sectiontag = "\n<Section>\n"; 521 } 522 my ($paratag) = ""; 523 if ($self->{'levels'}->{'Paragraph'}) { 524 $paratag = "<Paragraph>"; 525 } 441 526 my $doc_section = 0; # just for this document 442 527 my $text = ""; … … 455 540 $doc_section++; 456 541 $self->{'num_sections'} += 1; 457 $text .= "<Section>\n"; 542 $text .= $sectiontag; 543 458 544 if ($indexed_doc) { 459 545 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); … … 464 550 if (!($real_field =~ s/^top//) || ($doc_section == 1)) { 465 551 my $new_text = ""; 552 my $tmp_text = ""; 466 553 if ($real_field eq "text") { 467 #print STDERR "in text bit"; 468 #$new_text = "<Paragraph>"; 469 $new_text .= $doc_obj->get_text ($section); 470 #$self->find_paragraphs($new_text); 554 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed 555 $new_text .= "<TX>\n"; 556 $tmp_text .= $doc_obj->get_text ($section); 557 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, $self->{'levels'}->{'Paragraph'}); 558 559 $new_text .= "$tmp_text</TX>\n"; 560 if (!defined $self->{'indexfields'}->{'TextOnly'}) { 561 $self->{'indexfields'}->{'TextOnly'} = 1; 562 } 563 } 564 else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment 565 $new_text .= $doc_obj->get_text ($section); 566 #if ($self->{'levels'}->{'Paragraph'}) { 567 #$self->find_paragraphs($new_text); 568 #} 569 
} 471 570 } else { # metadata field 472 571 if ($real_field eq "metadata") { # insert all metadata 473 474 #print STDERR "in metadata bit\n";572 #except gsdl stuff 573 my $shortname = ""; 475 574 my $metadata = $doc_obj->get_all_metadata ($section); 476 575 foreach $pair (@$metadata) { 477 576 my ($mfield, $mvalue) = (@$pair); 478 #print STDERR "$mfield, $mvalue\n"; 479 # check fields here, maybe others dont want 577 # check fields here, maybe others dont want - change to use dontindex!! 480 578 if ($mfield ne "Identifier" && $mfield ne "classifytype" && 481 579 $mfield !~ /^gsdl/ && defined $mvalue && $mvalue ne "") { 482 483 $new_text .= "<$mfield>$mvalue</$mfield>\n"; 484 #print STDERR "metadata=$mfield:$mvalue"; 485 if (!defined $self->{'indexfields'}->{$mfield}) { 486 $self->{'indexfields'}->{$mfield} = 1; 487 } 580 581 if (defined $self->{'indexfieldmap'}->{$mfield}) { 582 $shortname = $self->{'indexfieldmap'}->{$mfield}; 583 } 584 else { 585 $shortname = $self->create_shortname($mfield); 586 $self->{'indexfieldmap'}->{$mfield} = $shortname; 587 $self->{'indexfieldmap'}->{$shortname} = 1; 588 } 589 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n"; 590 if (!defined $self->{'indexfields'}->{$mfield}) { 591 $self->{'indexfields'}->{$mfield} = 1; 592 } 488 593 } 489 594 } 490 595 491 596 } 492 597 else { #individual metadata specified 598 my $shortname=""; 493 599 if (!defined $self->{'indexfields'}->{$real_field}) { 494 600 $self->{'indexfields'}->{$real_field} = 1; 495 } 601 } 602 if (defined $self->{'indexfieldmap'}->{$real_field}) { 603 $shortname = $self->{'indexfieldmap'}->{$real_field}; 604 } 605 else { 606 $shortname = $self->create_shortname($real_field); 607 $self->{'indexfieldmap'}->{$real_field} = $shortname; 608 $self->{'indexfieldmap'}->{$shortname} = 1; 609 } 496 610 foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) { 497 $new_text .= " <$real_field>$item</$real_field>\n";611 $new_text .= 
"$paratag<$shortname>$item</$shortname>\n"; 498 612 } 499 613 } … … 508 622 $new_text =~ /[\(\)\{\}]/) { 509 623 } 510 624 $self->{'num_processed_bytes'} += length ($new_text); 511 625 $text .= "$new_text"; 512 626 } … … 519 633 } 520 634 635 sub create_shortname { 636 $self = shift(@_); 637 638 my ($realname) = @_; 639 #take the first two chars 640 my ($shortname) = $realname =~ /^(\w\w)/; 641 $shortname =~ tr/a-z/A-Z/; 642 643 #if already used, take the first and third letters and so on 644 $count = 1; 645 while (defined $self->{'indexfieldmap'}->{$shortname}) { 646 if ($realname =~ /^(\w).{$count}(\w)/) { 647 $shortname = "$1$2"; 648 $count++; 649 $shortname =~ tr/a-z/A-Z/; 650 651 } 652 else { 653 $realname =~ s/^.//; 654 $count = 0; 655 } 656 } 657 658 return $shortname; 659 } 660 521 661 1; 522 662
Note:
See TracChangeset
for help on using the changeset viewer.