Changeset 12844
- Timestamp:
- 2006-09-25T14:17:10+12:00 (18 years ago)
- Location:
- trunk/gsdl
- Files:
-
- 26 added
- 9 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/buildcol.pl
r12821 r12844 1 1 #!/usr/bin/perl -w 2 2 3 ########################################################################### 4 # 5 # buildcol.pl -- This program will build a particular collection 3 ## @file buildcol.pl 4 # This program will build a particular collection. 6 5 # A component of the Greenstone digital library software 7 6 # from the New Zealand Digital Library Project at the 8 7 # University of Waikato, New Zealand. 9 #10 # Copyright (C) 1999 New Zealand Digital Library Project11 8 # 12 9 # This program is free software; you can redistribute it and/or modify … … 24 21 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 25 22 # 26 ########################################################################### 27 28 # 11/04/03 Added usage datastructure - John Thompson 29 23 # @note 11/04/03 Added usage datastructure - John Thompson 24 # 25 # @author New Zealand Digital Library Project unless otherwise stated 26 # @copy 1999 New Zealand Digital Library Project 27 # 30 28 package buildcol; 31 29 … … 194 192 'type' => "flag", 195 193 'reqd' => "no", 196 'hiddengli' => "yes" } ]; 194 'hiddengli' => "yes" }, 195 { 'name' => "incremental", 196 'desc' => "{buildcol.incremental}", 197 'type' => "flag", 198 'reqd' => "no", 199 'modegli' => "3" } ]; 197 200 198 201 my $options = { 'name' => "buildcol.pl", … … 206 209 my $out; 207 210 208 sub gsprintf 211 ## @method gsprintf() 212 # Print a string to the screen after looking it up from a locale dependant 213 # strings file. This function is losely based on the idea of resource 214 # bundles as used in Java. 215 # 216 # @param $error The STDERR stream. 217 # @param $text The string containing GS keys that should be replaced with 218 # their locale dependant equivilents. 219 # @param $out The output stream. 220 # @return The locale-based string to output. 221 # 222 sub gsprintf() 209 223 { 210 224 return &gsprintf::gsprintf(@_); 211 225 } 212 213 226 ## gsprintf() ## 214 227 215 228 &main(); 216 229 230 ## @method main() 231 # 232 # [Parses up and validates the arguments to the build process before creating 233 # the appropriate build process to do the actual work - John] 234 # 235 # @note Added true incremental support - John Thompson, DL Consulting Ltd. 236 # @note There were several bugs regarding using directories other than 237 # "import" or "archives" during import and build quashed. - John 238 # Thompson, DL Consulting Ltd. 239 # 240 # @param $incremental If true indicates this build should not regenerate all 241 # the index and metadata files, and should instead just 242 # append the information found in the archives directory 243 # to the existing files. If this requires some complex 244 # work so as to correctly insert into a classifier so be 245 # it. Of course none of this is done here - instead the 246 # incremental argument is passed to the document 247 # processor. 248 # 217 249 sub main 218 250 { … … 222 254 $create_images, $collectdir, $build, $type, $textindex, 223 255 $no_strip_html, $no_text, $faillog, $gli, $index, $language, 224 $sections_index_document_metadata, $maxnumeric );256 $sections_index_document_metadata, $maxnumeric, $incremental); 225 257 226 258 my $xml = 0; … … 296 328 297 329 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib"); 330 # Don't know why this didn't already happen, but now collection specific 331 # classify and plugins directory also added to include path 332 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/classify"); 333 unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/plugins"); 298 334 299 335 # read the configuration file … … 398 434 $gli = 0 unless defined $gli; 399 435 436 # New argument to track whether build is incremental 437 $incremental = 0 unless defined $incremental; 438 400 439 print STDERR "<Build>\n" if $gli; 401 440 … … 421 460 # were supplied, turn all \ into / and remove trailing / 422 461 423 my ($realarchivedir, $realbuilddir); 424 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq ""; 462 my ($realarchivedir, $realbuilddir); 463 # Modified so that the archivedir, if provided as an argument, is made 464 # absolute if it isn't already 465 if ($archivedir eq "") 466 { 467 $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives"); 468 } 469 else 470 { 471 $archivedir = &make_absolute($ENV{'GSDLCOLLECTDIR'}, $archivedir); 472 } 473 # End Mod 425 474 $archivedir =~ s/[\\\/]+/\//g; 426 475 $archivedir =~ s/\/$//; … … 476 525 "\$realarchivedir, \$realbuilddir, \$verbosity, " . 477 526 "\$maxdocs, \$debug, \$keepold, \$remove_empty_classifications, " . 478 "\$out, \$no_text, \$faillog, \$gli )");527 "\$out, \$no_text, \$faillog, \$gli, \$incremental)"); 479 528 die "$@" if $@; 480 529 … … 519 568 print STDERR "</Build>\n" if $gli; 520 569 } 521 570 ## main() ## 571 572 ## @method create_images() 573 # 574 # [Used to create default cover images... from what I'm not quite sure - John] 575 # 522 576 sub create_images { 523 577 my ($collection_name) = @_; … … 571 625 close CFGFILE; 572 626 } 627 ## create_images() ## 628 629 ## @method make_absolute() 630 # 631 # Ensure the given file path is absolute in respect to the given base path. 632 # 633 # @param $base_dir A string denoting the base path the given dir must be 634 # abolsute to. 635 # @param $dir The directory to be made absolute as a string. Note that the 636 # dir may already be absolute, in which case it will remain 637 # unchanged. 638 # @return The now absolute form of the directory as a string. 639 # 640 # @author John Thompson, DL Consulting Ltd. 641 # @copy 2006 DL Consulting Ltd. 642 # 643 sub make_absolute() 644 { 645 my ($base_dir, $dir) = @_; 646 $dir = $base_dir . "/$dir" if ($dir =~ m#^[^/]#); 647 $dir =~ s|^/tmp_mnt||; 648 1 while($dir =~ s|/[^/]*/\.\./|/|g); 649 $dir =~ s|/[.][.]?/|/|g; 650 $dir =~ tr|/|/|s; 651 return $dir; 652 } 653 ## make_absolute() ## -
trunk/gsdl/bin/script/lucene_passes.pl
r12258 r12844 5 5 # lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene 6 6 # A component of the Greenstone digital library software 7 # from the New Zealand Digital Library Project at the 7 # from the New Zealand Digital Library Project at the 8 8 # University of Waikato, New Zealand. 9 9 # … … 39 39 use ghtml; 40 40 41 42 41 sub open_java_lucene 43 42 { 44 my ($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity) = @_; 45 46 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java"); 47 my $classpath = &util::filename_cat($bin_java,"LuceneWrap.jar"); 48 49 my $java_lucene = "java -classpath \"$classpath\" org.nzdl.gsdl.LuceneWrap.GS2LuceneIndexer"; 50 my $cmd_options = "$create -verbosity $verbosity"; 51 my $java_cmd = "$java_lucene $cmd_options $doc_tag_level \"$full_builddir\" $indexdir"; 52 53 if (!open (PIPEOUT, "| $java_cmd")) { 54 die "$PROGNAME - couldn't run $java_cmd\n"; 55 } 43 my ($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity) = @_; 44 45 my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java"); 46 my $classpath = &util::filename_cat($bin_java,"LuceneWrap.jar"); 47 48 my $java_lucene = "java -classpath \"$classpath\" org.nzdl.gsdl.LuceneWrap.GS2LuceneIndexer"; 49 my $cmd_options = "$create -verbosity $verbosity"; 50 my $java_cmd = "$java_lucene $cmd_options $doc_tag_level \"$full_builddir\" $indexdir"; 51 52 open (PIPEOUT, "| $java_cmd") or die "$PROGNAME - couldn't run $java_cmd\n"; 56 53 } 57 54 58 55 sub close_java_lucene 59 56 { 60 57 close(PIPEOUT); 61 58 } 62 59 … … 66 63 $dir_sep = &util::get_os_dirsep(); 67 64 68 my $full_output_filename 69 = &util::filename_cat($full_textdir,$output_filename); 65 my $full_output_filename = &util::filename_cat($full_textdir,$output_filename); 70 66 my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x); 71 67 &util::mk_all_dir($full_output_dir); 72 73 open(DOCOUT,">$full_output_filename") 68 69 open(DOCOUT,">$full_output_filename") 74 70 || die "Unable to open $full_output_filename"; 75 71 … … 78 74 79 75 my @secs = ($doc_xml =~ m/<Sec\s+gs2:id="\d+"\s*>.*?<\/Sec>/sg); 80 81 82 # Currently not used, but consult with DB before removing83 # foreach my $sec (@secs) {84 # my ($docnum,$sec_text) = ($sec =~ m/<Sec\s+gs2:id="(\d+)"\s*>(.*?)<\/Sec>/s);85 # my $docnum_filename86 # = &util::filename_cat($full_textdir,"$docnum.xml");87 88 #89 # open(SECOUT,">$docnum_filename")90 # || die "Unable to open $docnum_filename";91 92 # print SECOUT &ghtml::unescape_html($sec_text);93 # close(SECOUT);94 # }95 96 76 } 97 77 … … 100 80 my ($full_textdir,$output_filename) = @_; 101 81 102 my $full_output_filename 82 my $full_output_filename 103 83 = &util::filename_cat($full_textdir,$output_filename); 104 84 … … 106 86 } 107 87 88 # This appears to be the callback that gets the xml stream during the 89 # build process, so I need to intercept it here and call my XML RPC 90 # to insert into the Lucene database. 108 91 sub monitor_xml_stream 109 92 { … … 150 133 } 151 134 135 136 # /** This checks the arguments on the command line, filters the 137 # * unknown command line arguments and then calls the open_java_lucene 138 # * function to begin processing. Most of the arguments are passed on 139 # * the command line of the java wrapper. 140 # * 141 # * Do not set -create and -remove at the same time, although -create is 142 # * required for -remove, -remove will set it it's self, if you set -create 143 # * after -remove the create will be ignored. 144 # * 145 # * @version 2.0 Added support for removing documents from the index by John Rowe 146 # * 147 # * @author John Rowe, DL Consulting 148 # */ 152 149 sub main 153 150 { 154 my (@argv) = @_; 155 my $argc = scalar(@argv); 156 157 my $create = ""; 158 my $verbosity = 1; 159 160 my @filtered_argv = (); 161 162 my $i = 0; 163 while ($i<$argc) { 164 if ($argv[$i] =~ m/^-(.*)$/) { 165 166 my $option = $1; 167 168 # -create causes build to be incremental 169 if ($option eq ("create")) { 170 $create = "-create"; 171 } 172 173 # -verbosity num 174 elsif ($option eq "verbosity") { 175 $i++; 176 if ($i<$argc) { 177 $verbosity = $argv[$i]; 178 } 179 } 180 else { 181 print STDERR "Unrecognised minus option: -$option\n"; 182 } 183 } 184 else { 185 push(@filtered_argv,$argv[$i]); 186 } 187 $i++; 151 my (@argv) = @_; 152 my $argc = scalar(@argv); 153 154 my $create = ""; 155 my $verbosity = 1; 156 157 my @filtered_argv = (); 158 159 my $i = 0; 160 while ($i<$argc) { 161 if ($argv[$i] =~ m/^\-(.*)$/) { 162 163 my $option = $1; 164 165 # -create causes build to be incremental 166 if ($option eq ("create")) { 167 print STDERR "\n\n-create set\n"; 168 $create = "-create"; 169 } 170 # In a blinding flash of unintuitiveness -remove causes 171 # -create to be set (we don't want to remove the old indexes) 172 elsif($option eq "remove") 173 { 174 # Look at the next arg for the oid and if that doesn't exist then 175 $i++; 176 if(!defined $argv[$i]) 177 { 178 print STDERR "Remove was specified but the OID was not specified"; 179 die "\n\nCannot continue"; 180 } 181 $removeoid = $argv[$i]; 182 print STDERR "\n\nWe're removing the document with id: '$removeoid'\n"; 183 184 # Now, to make sure this gets through to the Java executable 185 $create = "-create -remove '$removeoid'"; 186 } 187 188 # -verbosity num 189 elsif ($option eq "verbosity") { 190 $i++; 191 if ($i<$argc) { 192 $verbosity = $argv[$i]; 193 } 194 } 195 else { 196 print STDERR "Unrecognised minus option: -$option\n"; 197 } 188 198 } 189 190 my $filtered_argc = scalar(@filtered_argv); 191 192 if ($filtered_argc < 4) { 193 print STDERR "Usage: $PROGNAME [-create|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n"; 194 exit 1; 199 else { 200 push(@filtered_argv,$argv[$i]); 195 201 } 196 197 my $mode = $filtered_argv[0]; 198 my $doc_tag_level = $filtered_argv[1]; 199 my $full_builddir = $filtered_argv[2]; 200 my $indexdir = $filtered_argv[3]; 202 $i++; 203 } 204 205 my $filtered_argc = scalar(@filtered_argv); 206 207 if ($filtered_argc < 4) { 208 print STDERR "Usage: $PROGNAME [-create|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n"; 209 exit 1; 210 } 211 212 my $mode = $filtered_argv[0]; 213 my $doc_tag_level = $filtered_argv[1]; 214 my $full_builddir = $filtered_argv[2]; 215 my $indexdir = $filtered_argv[3]; 201 216 ### print STDERR "**** ARGS = ", join(" ", @argv), "\n"; 202 217 203 my $full_textdir = &util::filename_cat($full_builddir,"text"); 204 205 if ($mode eq "index") { 206 # don't need the lucene stuff if we are just storing the docs 207 open_java_lucene($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity); 208 } 209 monitor_xml_stream($mode, $full_textdir); 210 if ($mode eq "index") { 211 close_java_lucene(); 212 } 218 my $full_textdir = &util::filename_cat($full_builddir,"text"); 219 220 if ($mode eq "index") { 221 # don't need the lucene stuff if we are just storing the docs 222 open_java_lucene($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity); 223 } 224 print STDERR "Monitoring for input!\n"; 225 monitor_xml_stream($mode, $full_textdir); 226 if ($mode eq "index") { 227 close_java_lucene(); 228 } 213 229 } 214 230 -
trunk/gsdl/perllib/basebuilder.pm
r12340 r12844 287 287 my $outhandle = $self->{'outhandle'}; 288 288 289 print STDERR "BuildDir: $self->{'build_dir'}\n"; 290 289 291 my $textdir = &util::filename_cat($self->{'build_dir'}, "text"); 290 292 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc"); … … 361 363 $self->{'gli'}); 362 364 365 # Output classifier reverse lookup, used in incremental deletion 366 #&classify::print_reverse_lookup($handle); 363 367 364 368 #output doclist … … 401 405 # store the number of documents and number of bytes 402 406 $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs(); 407 $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections(); 403 408 $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes(); 404 405 409 406 410 # store the mapping between the index names and the directory names -
trunk/gsdl/perllib/basebuildproc.pm
r11994 r12844 40 40 } 41 41 42 sub new {43 my ($class, $collection, $source_dir, $build_dir, $keepold,44 $verbosity, $outhandle) = @_; 42 sub new() 43 { 44 my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_; 45 45 my $self = new docproc (); 46 46 … … 71 71 # For incremental building need to seed num_docs etc from values 72 72 # stored in build.cfg (if present) 73 73 print STDERR "Keepold!\n"; 74 74 $buildconfigfile = &util::filename_cat($build_dir, "build.cfg"); 75 75 print STDERR "Build cfg: $buildconfigfile\n"; 76 76 if (-e $buildconfigfile) { 77 77 $found_num_data = 1; … … 81 81 $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, 82 82 "index", "build.cfg"); 83 print STDERR "Index cfg: $buildconfigfile\n"; 83 84 if (-e $buildconfigfile) { 84 85 $found_num_data = 1; … … 86 87 } 87 88 88 } 89 90 if ($found_num_data) { 89 } 90 #else 91 # { 92 # print STDERR "Removeold!\n"; 93 # } 94 95 if ($found_num_data) 96 { 97 #print STDERR "Found_Num_Data!\n"; 91 98 my $buildcfg = &colcfg::read_build_cfg($buildconfigfile); 92 93 99 $self->{'starting_num_docs'} = $buildcfg->{'numdocs'}; 100 #print STDERR "- num_docs: $self->{'starting_num_docs'}\n"; 94 101 $self->{'starting_num_sections'} = $buildcfg->{'numsections'}; 102 #print STDERR "- num_sections: $self->{'starting_num_sections'}\n"; 95 103 $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'}; 96 } 97 else { 98 $self->{'starting_num_docs'} = 0; 104 #print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n"; 105 } 106 else 107 { 108 #print STDERR "NOT Found_Num_Data!\n"; 109 $self->{'starting_num_docs'} = 0; 99 110 $self->{'starting_num_sections'} = 0; 100 111 $self->{'starting_num_bytes'} = 0; 101 }112 } 102 113 103 114 $self->{'output_handle'} = "STDOUT"; … … 428 439 # output the matching document number 429 440 print $handle "<docnum>$self->{'num_docs'}\n"; 441 430 442 } else { 431 443 # output a list of children … … 612 624 } 613 625 614 sub assoc_files {626 sub assoc_files() { 615 627 my $self = shift (@_); 616 628 my ($doc_obj, $archivedir) = @_; … … 618 630 619 631 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) { 632 #rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n"; 620 633 # if assoc file starts with a slash, we put it relative to the assoc 621 634 # dir, otherwise it is relative to the HASH... directory 622 635 if ($assoc_file->[1] =~ m@^[/\\]@) { 623 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);636 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]); 624 637 } else { 625 638 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]); -
trunk/gsdl/perllib/classify.pm
r12559 r12844 43 43 44 44 $next_classify_num = 1; 45 $oid_to_clids = {}; 46 45 47 sub load_classifier_for_info { 46 48 my ($classifier) = shift @_; … … 302 304 $tempinfo->{'classifyOID'} = "CL$next_classify_num" unless defined($tempinfo->{'classifyOID'}); 303 305 $next_classify_num++; 306 307 print STDERR "*** outputting information for classifier: $tempinfo->{'classifyOID'}\n"; 308 304 309 push (@{$classifyinfo->{'contains'}}, $tempinfo); 305 310 } … … 357 362 $contains_text .= $tempinfo->{'classifyOID'}; 358 363 } 364 365 # Extra code for incremental building. 366 # We need to store a listing of the classifiers each DOI is in 367 my $clids = []; 368 #rint STDERR "==1. Recording reverse lookup for " . $tempinfo->{'classifyOID'} . "==\n"; 369 if(defined($oid_to_clids->{$tempinfo->{'classifyOID'}})) { 370 #rint STDERR "Found existing array!\n"; 371 $clids = $oid_to_clids->{$tempinfo->{'classifyOID'}}; 372 } 373 #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n"; 374 push(@{$clids}, $OID); 375 $oid_to_clids->{$tempinfo->{'classifyOID'}} = $clids; 376 #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n"; 377 359 378 &print_classify_info ($handle, $tempinfo, $tempinfo->{'classifyOID'}, 360 379 $remove_empty_classifications); 361 380 } elsif (defined ($tempinfo->{'OID'})) { 362 381 $contains_text .= $tempinfo->{'OID'}; 363 $mdoffset_text .= $tempinfo->{'offset'} 364 if (defined ($tempinfo->{'offset'})) 365 # note: we don't want to print the contents of the books 382 $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'})); 383 384 385 # note: we don't want to print the contents of the books 386 # Extra code for incremental building. 387 # We need to store a listing of the classifiers each DOI is in 388 my $clids = []; 389 #rint STDERR "==2. Recording reverse lookup for " . $tempinfo->{'OID'} . "==\n"; 390 if(defined($oid_to_clids->{$tempinfo->{'OID'}})) { 391 #rint STDERR "Found existing array!\n"; 392 $clids = $oid_to_clids->{$tempinfo->{'OID'}}; 393 } 394 #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n"; 395 push(@{$clids}, $OID); 396 $oid_to_clids->{$tempinfo->{'OID'}} = $clids; 397 #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n"; 398 399 366 400 } else { 367 401 … … 373 407 $contains_text .= "\".$next_subOID"; 374 408 } 409 410 # Extra code for incremental building. 411 # We need to store a listing of the classifiers each DOI is in 412 my $clids = []; 413 #rint STDERR "==3. Recording reverse lookup for $OID.$next_subOID==\n"; 414 if(defined($oid_to_clids->{$OID . "." . $next_subOID})) { 415 #rint STDERR "Found existing array!\n"; 416 $clids = $oid_to_clids->{$OID . "." . $next_subOID}; 417 } 418 #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n"; 419 push(@{$clids}, $OID); 420 $oid_to_clids->{$OID . "." . $next_subOID} = $clids; 421 #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n"; 375 422 376 423 &print_classify_info ($handle, $tempinfo, "$OID.$next_subOID", -
trunk/gsdl/perllib/lucenebuilder.pm
r11175 r12844 3 3 # lucenebuilder.pm -- perl wrapper for building index with Lucene 4 4 # A component of the Greenstone digital library software 5 # from the New Zealand Digital Library Project at the 5 # from the New Zealand Digital Library Project at the 6 6 # University of Waikato, New Zealand. 7 7 # … … 24 24 ########################################################################### 25 25 26 ########################################################################### 27 # /* 28 # * @version 1.0 ? 29 # * @version 2.0 Incremental building assistance added, including 30 # * remove_document_from_database which implements the granddad's 31 # * empty function to call the lucene_passes.pl and full_lucene_passes_exe 32 # * so there is one place in the code that works out where the 33 # * perl script is. John Rowe 34 # * 35 # * @author John Rowe, DL Consulting Ltd. 36 # */ 37 ########################################################################### 38 26 39 package lucenebuilder; 27 40 28 41 # Use same basic XML structure setup by mgppbuilder/mgppbuildproc 29 42 30 use mgppbuilder; 43 use mgppbuilder; 44 45 use IncrementalBuildUtils; 31 46 32 47 sub BEGIN { … … 34 49 } 35 50 36 51 # /** 52 # * @author John Thompson, DL Consulting Ltd. 53 # */ 37 54 sub new { 38 55 my $class = shift(@_); 39 my ($collection, $source_dir, $build_dir, $verbosity, 40 $maxdocs, $debug, $keepold, $allclassifications, 41 $outhandle, $no_text, $gli) = @_; 56 my ($collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug, $keepold, $allclassifications, $outhandle, $no_text, $faillog, $gli, $incremental) = @_; 42 57 43 58 my $self = new mgppbuilder (@_); … … 46 61 $self->{'buildtype'} = "lucene"; 47 62 63 # Do we need to put exe on the end? 64 my $exe = &util::get_os_exe (); 65 my $scriptdir = "$ENV{'GSDLHOME'}/bin/script"; 66 67 # So where is lucene_passes.pl anyway? 68 my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl"); 69 70 # So tack perl on the beginning to ensure execution 71 $self->{'full_lucene_passes'} = "$lucene_passes_script"; 72 if ($exe eq ".exe") 73 { 74 $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\""; 75 } 76 else 77 { 78 $self->{'full_lucene_passes_exe'} = "perl -S \"$lucene_passes_script\""; 79 } 80 81 # We must also record whether we have been asked to do just an incremental 82 # build (which makes no difference to the Lucene indexing bit, just the 83 # building of the classifiers in the GDBM). 84 $self->{'incremental'} = $incremental; 85 48 86 return $self; 49 87 } 88 # /** new() **/ 50 89 51 90 sub default_buildproc { … … 56 95 57 96 # this writes a nice version of the text docs 58 sub compress_text {59 97 sub compress_text 98 { 60 99 my $self = shift (@_); 61 62 100 # we don't do anything if we don't want compressed text 63 101 return if $self->{'no_text'}; … … 72 110 73 111 my $osextra = ""; 74 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 75 $text_dir =~ s@/@\\@g; 76 } else { 77 if ($outhandle ne "STDERR") { 78 # so lucene_passes doesn't print to stderr if we redirect output 79 $osextra .= " 2>/dev/null"; 80 } 81 } 82 112 if ($ENV{'GSDLOS'} =~ /^windows$/i) 113 { 114 $text_dir =~ s@/@\\@g; 115 } 116 else 117 { 118 if ($outhandle ne "STDERR") 119 { 120 # so lucene_passes doesn't print to stderr if we redirect output 121 $osextra .= " 2>/dev/null"; 122 } 123 } 83 124 84 125 # get any os specific stuff 85 126 my $scriptdir = "$ENV{'GSDLHOME'}/bin/script"; 86 127 87 my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl"); 88 my $full_lucene_passes_exe = "\"$lucene_passes_exe\""; 89 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 90 $full_lucene_passes_exe = "perl.exe -S \"$lucene_passes_exe\""; 91 } 128 # Find the perl script to call to run lucene 129 my $full_lucene_passes = $self->{'full_lucene_passes'}; 130 my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'}; 131 92 132 my $lucene_passes_sections = "Doc"; 93 133 94 134 my ($handle); 95 135 96 if ($self->{'debug'}) { 97 $handle = STDOUT; 98 } else { 99 if (!-e "$lucene_passes_exe" || 100 !open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra")) { 101 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'}; 102 die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n"; 103 } 104 $handle = lucenebuilder::PIPEOUT; 105 } 136 if ($self->{'debug'}) 137 { 138 $handle = STDOUT; 139 } 140 else 141 { 142 print STDERR "Full Path: $full_lucene_passes\n"; 143 print STDERR "Executable: $full_lucene_passes_exe\n"; 144 print STDERR "Sections: $lucene_passes_sections\n"; 145 print STDERR "Build Dir: $build_dir\n"; 146 print STDERR "Cmd: $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra\n"; 147 if (!-e "$full_lucene_passes" || 148 !open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra")) 149 { 150 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'}; 151 die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n"; 152 } 153 $handle = lucenebuilder::PIPEOUT; 154 } 106 155 my $levels = $self->{'levels'}; 107 156 my $gdbm_level = "document"; 108 if ($levels->{'section'}) { 109 $gdbm_level = "section"; 110 } 157 if ($levels->{'section'}) 158 { 159 $gdbm_level = "section"; 160 } 111 161 112 162 undef $levels->{'paragraph'}; # get rid of para if we had it. … … 117 167 $self->{'buildproc'}->set_indexing_text (0); 118 168 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 119 $self->{'buildproc'}->set_levels ($levels); 120 $self->{'buildproc'}->set_gdbm_level ($gdbm_level); 169 $self->{'buildproc'}->set_levels ($levels); 170 $self->{'buildproc'}->set_gdbm_level ($gdbm_level); 121 171 $self->{'buildproc'}->reset(); 122 &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'}, 123 $self->{'buildproc'}, $self->{'maxdocs'}); 124 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 125 172 &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'}, 173 $self->{'buildproc'}, $self->{'maxdocs'}); 174 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 175 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'}); 126 176 &plugin::end($self->{'pluginfo'}); 127 177 close ($handle) unless $self->{'debug'}; … … 130 180 131 181 print STDERR "</Stage>\n" if $self->{'gli'}; 132 133 } 182 } 134 183 135 184 sub build_indexes { … … 140 189 my $indexes = []; 141 190 if (defined $indexname && $indexname =~ /\w/) { 142 191 push @$indexes, $indexname; 143 192 } else { 144 145 } 146 147 # create the mapping between the index descriptions 193 $indexes = $self->{'collect_cfg'}->{'indexes'}; 194 } 195 196 # create the mapping between the index descriptions 148 197 # and their directory names (includes subcolls and langs) 149 198 $self->{'index_mapping'} = $self->create_index_mapping ($indexes); … … 151 200 # build each of the indexes 152 201 foreach $index (@$indexes) { 153 154 155 156 157 158 159 # should probably check that new name with level 160 161 162 163 164 165 print $outhandle "\n*** building index $index at level $llevel in subdirectory " . 166 167 168 169 170 171 172 173 174 175 202 if ($self->want_built($index)) { 203 204 my $idx = $self->{'index_mapping'}->{$index}; 205 foreach my $level (keys %{$self->{'levels'}}) { 206 next if $level =~ /paragraph/; # we don't do para indexing 207 my ($pindex) = $level =~ /^(.)/; 208 # should probably check that new name with level 209 # is unique ... but currently (with doc sec and para) 210 # each has unique first letter. 211 $self->{'index_mapping'}->{$index} = $pindex.$idx; 212 213 my $llevel = $mgppbuilder::level_map{$level}; 214 print $outhandle "\n*** building index $index at level $llevel in subdirectory " . 215 "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1); 216 print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'}; 217 218 $self->build_index($index,$llevel); 219 } 220 $self->{'index_mapping'}->{$index} = $idx; 221 222 } else { 223 print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1); 224 } 176 225 } 177 226 … … 179 228 $self->make_final_field_list(); 180 229 } 230 231 # /** Lucene specific document removal function. This works by calling lucene_passes.pl with 232 # * -remove and the document id on the command line. 233 # * 234 # * @param oid is the document identifier to be removed. 235 # * 236 # * @author John Rowe, DL Consulting Ltd. 237 # */ 238 sub remove_document_from_database 239 { 240 my ($self, $oid) = @_; 241 # Find the perl script to call to run lucene 242 my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'}; 243 # Call lucene_passes.pl with -remove and the document ID on the command line 244 `$full_lucene_passes_exe -remove "$oid"`; 245 } 246 # /** remove_document_from_database **/ 181 247 182 248 sub build_index { … … 194 260 my $scriptdir = "$ENV{'GSDLHOME'}/bin/script"; 195 261 196 my $exe = &util::get_os_exe (); 197 my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl"); 198 my $full_lucene_passes_exe = "\"$lucene_passes_exe\""; 199 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 200 $full_lucene_passes_exe = "perl.exe -S \"$lucene_passes_exe\""; 201 } 262 # Find the perl script to call to run lucene 263 my $full_lucene_passes = $self->{'full_lucene_passes'}; 264 my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'}; 202 265 203 266 # define the section names for lucenepasses … … 209 272 my $osextra = ""; 210 273 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 211 274 $build_dir =~ s@/@\\@g; 212 275 } else { 213 214 215 216 217 } 218 276 if ($outhandle ne "STDERR") { 277 # so lucene_passes doesn't print to stderr if we redirect output 278 $osextra .= " 2>/dev/null"; 279 } 280 } 281 219 282 # get the index expression if this index belongs 220 283 # to a subcollection … … 222 285 my $langarr = []; 223 286 224 # there may be subcollection info, and language info. 287 # there may be subcollection info, and language info. 225 288 my ($fields, $subcollection, $language) = split (":", $index); 226 289 my @subcollections = (); … … 228 291 229 292 foreach $subcollection (@subcollections) { 230 231 232 } 233 } 234 293 if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) { 294 push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection}); 295 } 296 } 297 235 298 # add expressions for languages if this index belongs to 236 # a language subcollection - only put languages expressions for the 299 # a language subcollection - only put languages expressions for the 237 300 # ones we want in the index 238 239 301 my @languages = (); 240 302 my $language_metadata = "Language"; 241 303 if (defined ($self->{'collect_cfg'}->{'language_metadata'})) { 242 304 $language_metadata = $self->{'collect_cfg'}->{'language_metadata'}; 243 305 } 244 306 @languages = split /,/, $language if (defined $language); 245 307 foreach my $language (@languages) { 246 247 248 249 250 251 252 253 254 308 my $not=0; 309 if ($language =~ s/^\!//) { 310 $not = 1; 311 } 312 if($not) { 313 push (@$langarr, "!$language"); 314 } else { 315 push (@$langarr, "$language"); 316 } 255 317 } 256 318 … … 261 323 262 324 if ($self->{'debug'}) { 263 325 $handle = STDOUT; 264 326 } else { 265 if (!-e "$lucene_passes_exe" || 266 !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) { 267 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'}; 268 die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n"; 269 } 270 $handle = lucenebuilder::PIPEOUT; 271 } 272 327 print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra\n"; 328 if (!-e "$full_lucene_passes" || 329 !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) { 330 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'}; 331 die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n"; 332 } 333 $handle = lucenebuilder::PIPEOUT; 334 } 335 273 336 my $store_levels = $self->{'levels'}; 274 337 my $gdbm_level = "document"; 275 338 if ($store_levels->{'section'}) { 276 277 } 278 339 $gdbm_level = "section"; 340 } 341 279 342 my $dom_level = ""; 280 343 foreach my $key (keys %$store_levels) { 281 282 283 344 if ($mgppbuilder::level_map{$key} eq $llevel) { 345 $dom_level = $key; 346 } 284 347 } 285 348 if ($dom_level eq "") { 286 287 349 print STDERR "Warning: unrecognized tag level $llevel\n"; 350 $dom_level = "document"; 288 351 } 289 352 … … 297 360 $self->{'buildproc'}->set_indexing_text (1); 298 361 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 299 $self->{'buildproc'}->set_levels ($local_levels); 362 $self->{'buildproc'}->set_levels ($local_levels); 300 363 $self->{'buildproc'}->set_gdbm_level($gdbm_level); 301 364 $self->{'buildproc'}->reset(); 302 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 303 365 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 366 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'}); 304 367 close ($handle) unless $self->{'debug'}; 305 368 306 369 $self->print_stats(); 307 370 308 $self->{'buildproc'}->set_levels ($store_levels); 371 $self->{'buildproc'}->set_levels ($store_levels); 309 372 print STDERR "</Stage>\n" if $self->{'gli'}; 310 } 373 } 374 375 # /** A modified version of the basebuilder.pm's function that generates the 376 # * information database (GDBM) from the GA documents. We need to change this 377 # * so that if we've been asked to do an incremental build we only add 378 # * metadata to autohierarchy classifiers via the IncrementalBuildUtils 379 # * module. All other classifiers and metadata will be ignored. 380 # */ 381 sub make_infodatabase 382 { 383 my $self = shift (@_); 384 my $outhandle = $self->{'outhandle'}; 385 386 my $dbext = ".bdb"; 387 $dbext = ".ldb" if &util::is_little_endian(); 388 my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $dbext); 389 390 # If we aren't doing an incremental addition, then we just call the super- 391 # classes version 392 # Note: Incremental addition can only occur if a text/<collection>.ldb 393 # already exists. If it doesn't, let the super classes function be 394 # called once to generate it. 395 if (!$self->{'incremental'} || !(-e $infodb_file)) 396 { 397 # basebuilder::make_infodatabase(@_); 398 # Note: this doesn't work as the direct reference means all the $self 399 # data is lost. 400 $self->basebuilder::make_infodatabase(@_); 401 return; 402 } 403 404 # Carry on with an incremental addition 405 print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1); 406 print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'}; 407 408 # 1. Init all the classifiers 409 &classify::init_classifiers ($self->{'classifiers'}); 410 # 2. Init the buildproc settings. 411 # Note: we still need this to process any associated files - but we 412 # don't expect to pipe anything to txt2db so we can do away with the 413 # complex output handle. 414 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc"); 415 &util::mk_all_dir ($assocdir); 416 $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important 417 $self->{'buildproc'}->set_assocdir ($assocdir); 418 # 3. Read in all the metadata from the files in the archives directory using 419 # the GAPlug and using ourselves as the document processor! 420 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'}); 421 422 print STDERR "</Stage>\n" if $self->{'gli'}; 423 } 311 424 312 425 1; -
trunk/gsdl/perllib/lucenebuildproc.pm
r12426 r12844 31 31 # Use same basic XML structure setup by mgppbuilder/mgppbuildproc 32 32 33 use mgppbuildproc; 33 use mgppbuildproc; 34 34 use ghtml; 35 35 use strict; … … 37 37 38 38 39 use IncrementalBuildUtils; 40 39 41 sub BEGIN { 40 42 @lucenebuildproc::ISA = ('mgppbuildproc'); … … 45 47 my $class = shift @_; 46 48 my $self = new mgppbuildproc (@_); 49 50 $self->{'numincdocs'} = 0; 47 51 48 52 return bless $self, $class; … … 86 90 87 91 my $docid=""; 88 if ($ldoc_level) { 89 if ($self->{'gdbm_level'} eq 'document') { 90 my $doc_sec_num = $self->{'num_docs'}; 92 if ($ldoc_level) 93 { 94 if ($self->{'gdbm_level'} eq 'document') 95 { 96 my $doc_sec_num = $self->{'num_docs'}; 91 97 $docid = "gs2:id=\"$doc_sec_num\""; 92 } else { 98 } 99 else 100 { 93 101 # default is section level 94 my $doc_sec_num = $self->{'num_sections'} +1;102 my $doc_sec_num = $self->{'num_sections'} + 1; 95 103 $docid = "gs2:id=\"$doc_sec_num\""; 96 97 }104 } 105 } 98 106 my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n"; 99 107 my $documentendtag = "\n</$doc_level>\n"; 100 101 my ($sectiontag) = ""; 102 if ($lsec_level) { 108 109 my ($sectiontag) = ""; 110 if ($lsec_level) 111 { 103 112 $sectiontag = $mgppbuildproc::level_map{'section'}; 104 }113 } 105 114 my ($parastarttag) = ""; 106 115 my ($paraendtag) = ""; 107 if ($self->{'levels'}->{'paragraph'}) { 108 if ($self->{'strip_html'}) { 116 if ($self->{'levels'}->{'paragraph'}) 117 { 118 if ($self->{'strip_html'}) 119 { 109 120 $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">"; 110 121 $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">"; 111 } else { 122 } 123 else 124 { 112 125 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n"; 113 114 }115 126 } 127 } 128 116 129 my $doc_section = 0; # just for this document 117 130 … … 120 133 # get the text for this document 121 134 my $section = $doc_obj->get_top_section(); 122 while (defined $section) { 135 while (defined $section) 136 { 123 137 # update a few statistics 124 138 $doc_section++; 125 $self->{'num_sections'} += 1; 126 127 if ($sectiontag ne "") { 128 my $secid = "gs2:id=\"".$self->{'num_sections'}."\""; 139 $self->{'num_sections'}++; 140 141 if ($sectiontag ne "") 142 { 143 my $secid = "gs2:id=\"".$self->{'num_sections'}."\""; 129 144 $text .= "\n<$sectiontag $secid >\n"; 130 145 } 131 146 132 147 # if we are doing subcollections, then some docs shouldn't be indexed. 133 # but we need to put the section tag placeholders in there so the 148 # but we need to put the section tag placeholders in there so the 134 149 # sections match up with gdbm db 135 150 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section"; 136 151 if (!$indexed_doc || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) { 137 152 $text .= "\n</$sectiontag>\n" if ($sectiontag ne ""); 138 153 $section = $doc_obj->get_next_section($section); 139 154 next; 140 141 155 } 156 142 157 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 143 foreach my $field (split (/;/, $fields)) { 158 foreach my $field (split (/;/, $fields)) 159 { 144 160 # only deal with this field if it doesn't start with top or 145 161 # this is the first section … … 147 163 next if (($real_field =~ s/^top//) && ($doc_section != 1)); 148 164 149 my $new_text = ""; 150 151 # we get allfields by default - do nothing 152 if ($real_field eq "allfields") { 153 154 } 165 my $new_text = ""; 166 my $tmp_text = ""; 167 168 # If allfields is requested add all metadata fields and text as 169 # belonging to the ZZ field 170 if ($real_field eq "allfields") { 171 # Text first - no html nor paragraph tags 172 $new_text .= "$parastarttag<ZZ index=\"1\">\n"; 173 $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, ""); 174 &ghtml::htmlsafe($tmp_text); 175 $new_text .= "$tmp_text</ZZ>$paraendtag\n"; 176 # Then Metadata 177 my $metadata = $doc_obj->get_all_metadata ($section); 178 foreach my $pair (@$metadata) { 179 my ($mfield, $mvalue) = (@$pair); 180 &ghtml::htmlsafe($mvalue); 181 # check fields here, maybe others dont want - change to use dontindex!! 182 if ($mfield ne "Identifier" 183 && $mfield !~ /^gsdl/ 184 && $mfield ne "classifytype" 185 && $mfield ne "assocfilepath" 186 && defined $mvalue && $mvalue ne "") { 187 $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n"; 188 } 189 if (!defined $self->{'indexfields'}->{$mfield}) { 190 $self->{'indexfields'}->{$mfield} = 1; 191 } 192 } 193 } 155 194 # metadata - output all metadata we know about except gsdl stuff 156 elsif ($real_field eq "metadata" ) {195 elsif ($real_field eq "metadata" || $real_field eq "allfields") { 157 196 my $shortname = ""; 158 197 my $metadata = $doc_obj->get_all_metadata ($section); 159 198 foreach my $pair (@$metadata) { 160 199 my ($mfield, $mvalue) = (@$pair); 200 &ghtml::htmlsafe($mvalue); 161 201 # check fields here, maybe others dont want - change to use dontindex!! 162 202 if ($mfield ne "Identifier" … … 165 205 && $mfield ne "assocfilepath" 166 206 && defined $mvalue && $mvalue ne "") { 167 207 168 208 if (defined $self->{'indexfieldmap'}->{$mfield}) { 169 209 $shortname = $self->{'indexfieldmap'}->{$mfield}; … … 173 213 $self->{'indexfieldmap'}->{$mfield} = $shortname; 174 214 $self->{'indexfieldmap'}->{$shortname} = 1; 175 } 215 } 176 216 $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n"; 177 217 if (!defined $self->{'indexfields'}->{$mfield}) { 178 218 $self->{'indexfields'}->{$mfield} = 1; 179 } 219 } 180 220 } 181 221 } 182 222 } 183 else { 223 else { 184 224 #individual metadata and or text specified - could be a comma separated list 185 225 my $shortname=""; … … 192 232 $self->{'indexfieldmap'}->{$shortname} = 1; 193 233 } 194 234 195 235 my @metadata_list = (); 196 236 foreach my $submeta (split /,/, $real_field) { … … 209 249 $new_text .= "$section_text</$shortname>$paraendtag\n"; 210 250 } 211 else { 212 # leave html stuff in, but escape the tags, and don't add Paragraph tags - never retrieve paras at the moment 213 &ghtml::htmlsafe($ section_text);214 $new_text .= $ section_text;251 else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment 252 $tmp_text .= $doc_obj->get_text ($section); 253 &ghtml::htmlsafe($tmp_text); 254 $new_text .= $tmp_text; 215 255 } 216 256 } … … 229 269 } 230 270 } 231 232 271 # filter the text 233 272 $self->filter_text ($field, $new_text); 234 273 $self->{'num_processed_bytes'} += length ($new_text); 274 235 275 $text .= "$new_text"; 236 276 } # foreach field 237 277 238 278 $text .= "\n</$sectiontag>\n" if ($sectiontag ne ""); 239 279 240 280 $section = $doc_obj->get_next_section($section); 241 281 } #while defined section 242 print $handle "$text\n$documentendtag"; 282 print $handle "$text\n$documentendtag"; 243 283 #print STDOUT "$text\n$documentendtag"; 244 284 } 245 285 286 # /** We make this builder pretend to be a document processor so we can get 287 # * information back from the plugins. 288 # * 289 # * @param $self A reference to this Lucene builder 290 # * @param $doc_obj A reference to a document object representing what was 291 # * parsed by the GAPlug 292 # * @param $file The name of the file parsed as a string 293 # * 294 # * @author John Thompson, DL Consulting Ltd 295 # */ 296 sub process() 297 { 298 my $self = shift (@_); 299 my ($doc_obj, $file) = @_; 300 301 # If this is called from any stage other than an incremental infodb we want 302 # to pass through to the superclass of build 303 if ($self->get_mode() eq "incinfodb") 304 { 305 print STDERR "*** Processing a document added using INCINFODB ***\n"; 306 my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/; 307 $archivedir = "" unless defined $archivedir; 308 $archivedir =~ s/\\/\//g; 309 $archivedir =~ s/^\/+//; 310 $archivedir =~ s/\/+$//; 311 312 # Number of files 313 print STDERR "There are " . scalar($doc_obj->get_assoc_files()) . " associated documents...\n"; 314 315 # resolve the final filenames of the files associated with this document 316 $self->assoc_files ($doc_obj, $archivedir); 317 318 # is this a paged or a hierarchical document 319 my ($thistype, $childtype) = $self->get_document_type ($doc_obj); 320 321 # Determine the actual docnum by checking if we've processed any 322 # previous incrementally added documents. If so, carry on from there. 323 # Otherwise we set the counter to be the same as the number of 324 # sections encountered during the previous build 325 if ($self->{'numincdocs'} == 0) 326 { 327 $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1; 328 } 329 330 my $section = $doc_obj->get_top_section (); 331 print STDERR "+ top section: '$section'\n"; 332 my $doc_OID = $doc_obj->get_OID(); 333 my $url = ""; 334 while (defined $section) 335 { 336 print STDERR "+ processing section: '$section'\n"; 337 # Attach all the other metadata to this document 338 # output the fact that this document is a document (unless doctype 339 # has been set to something else from within a plugin 340 my $dtype = $doc_obj->get_metadata_element ($section, "doctype"); 341 if (!defined $dtype || $dtype !~ /\w/) 342 { 343 $doc_obj->add_utf8_metadata($section, "doctype", $dtype); 344 } 345 # output whether this node contains text 346 if ($doc_obj->get_text_length($section) > 0) 347 { 348 $doc_obj->add_utf8_metadata($section, "hastxt", 1); 349 } 350 else 351 { 352 $doc_obj->add_utf8_metadata($section, "hastxt", 0); 353 } 354 355 # output archivedir if at top level 356 if ($section eq $doc_obj->get_top_section()) 357 { 358 $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir); 359 $doc_obj->add_utf8_metadata($section, "thistype", $thistype); 360 } 361 362 # output a list of children 363 my $children = $doc_obj->get_children ($section); 364 if (scalar(@$children) > 0) 365 { 366 $doc_obj->add_utf8_metadata($section, "childtype", $childtype); 367 my @contains = (); 368 foreach my $child (@$children) 369 { 370 if ($child =~ /^.*?\.(\d+)$/) 371 { 372 push (@contains, "\".$1"); 373 } 374 else 375 { 376 push (@contains, "\".$child"); 377 } 378 } 379 $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains)); 380 } 381 #output the matching doc number 382 print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n"; 383 $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'}); 384 385 $self->{'numincdocs'}++; 386 $section = $doc_obj->get_next_section($section); 387 # if no sections wanted, only gdbm the docs 388 last if ($self->{'gdbm_level'} eq "document"); 389 } 390 print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n"; 391 &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section()); 392 } 393 else 394 { 395 $self->mgppbuildproc::process(@_); 396 } 397 } 398 # /** process() **/ 399 400 sub get_num_docs { 401 my $self = shift (@_); 402 #rint STDERR "get_num_docs(): $self->{'num_docs'}\n"; 403 return $self->{'num_docs'}; 404 } 405 406 sub get_num_sections { 407 my $self = shift (@_); 408 #rint STDERR "get_num_sections(): $self->{'num_sections'}\n"; 409 return $self->{'num_sections'}; 410 } 411 412 # num_bytes is the actual number of bytes in the collection 413 # this is normally the same as what's processed during text compression 414 sub get_num_bytes { 415 my $self = shift (@_); 416 #rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n"; 417 return $self->{'num_bytes'}; 418 } 419 246 420 1; 247 421 -
trunk/gsdl/perllib/plugins/GAPlug.pm
r12169 r12844 70 70 $self->{'metadata_value'} = ""; 71 71 $self->{'content'} = ""; 72 73 # # Currently used to store information for previous values controls. In 74 # # the next contract I'll move to using information directly from Lucene. 75 # $self->{'sqlfh'} = 0; 72 76 73 77 return bless $self, $class; … … 128 132 elsif ($element eq "Metadata") { 129 133 $self->{'doc_obj'}->add_utf8_metadata($self->{'section'}, $self->{'metadata_name'},$self->{'metadata_value'}); 134 # Ensure this value is added to the allvalues database in gseditor. 135 # Note that the database constraints prevent multiple occurances of the 136 # same key-value pair. 137 # We write these out to a file, so they can all be commited in one 138 # transaction 139 #if (!$self->{'sqlfh'}) 140 # { 141 # my $sql_file = $ENV{'GSDLHOME'} . "/collect/lld/tmp/gseditor.sql"; 142 # # If the file doesn't already exist, open it and begin a transaction 143 # my $sql_fh; 144 # if (!-e $sql_file) 145 # { 146 # open($sql_fh, ">" . $sql_file); 147 # print $sql_fh "BEGIN TRANSACTION;\n"; 148 # } 149 # else 150 # { 151 # open($sql_fh, ">>" . $sql_file); 152 # } 153 # print STDERR "Opened SQL log\n"; 154 # $self->{'sqlfh'} = $sql_fh; 155 # } 156 157 #my $mvalue = $self->{'metadata_value'}; 158 #$mvalue =~ s/\'/\'\'/g; 159 #$mvalue =~ s/_claimantsep_/ \& /g; 160 161 #my $fh = $self->{'sqlfh'}; 162 #if ($fh) 163 # { 164 # print $fh "INSERT INTO allvalues (mkey, mvalue) VALUES ('" . $self->{'metadata_name'} . "', '" . $mvalue . "');\n"; 165 # } 166 167 # Clean Up 130 168 $self->{'metadata_name'} = ""; 131 169 $self->{'metadata_value'} = ""; … … 135 173 $self->{'content'} = ""; 136 174 } 137 138 175 $self->{'element'} = ""; 139 176 } … … 161 198 sub close_document { 162 199 my $self = shift(@_); 163 200 164 201 # add the associated files 165 202 my $assoc_files = … … 170 207 171 208 my $assoc_filepath=shift (@$assoc_filepath_list); 172 if (defined ($assoc_filepath)) { 173 # make absolute rather than relative... 174 $self->{'filename'} =~ m@^(.*[\\/]archives)@; 175 $assoc_filepath = "$1/$assoc_filepath/"; 176 } else { 209 210 #rint STDERR "Filename is: " . $self->{'filename'} . "\n"; 211 #rint STDERR "Initially my assoc_filepath is: $assoc_filepath\n"; 212 #rint STDERR "Custom archive dir is: " . $self->{'base_dir'} . "\n"; 213 # Correct the assoc filepath if one is defined 214 if (defined ($assoc_filepath)) 215 { 216 # Check whether the assoc_filepath already includes the base dir 217 if (index($assoc_filepath, $self->{'base_dir'}) == -1) 218 { 219 # And if not, append it so as to make this absolute 220 $assoc_filepath = &util::filename_cat($self->{'base_dir'}, $assoc_filepath); 221 } 222 } 223 else 224 { 177 225 $assoc_filepath = $self->{'filename'}; 178 226 $assoc_filepath =~ s/[^\\\/]*$//; 179 } 227 } 228 #rint STDERR "Goned and made it absolute: $assoc_filepath\n"; 180 229 181 230 foreach my $assoc_file_info (@$assoc_files) { 182 231 my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info); 232 #rint STDERR "assoc_file: $assoc_file\n"; 233 #rint STDERR "mime_type: $mime_type\n"; 234 #rint STDERR "dir: $dir\n"; 183 235 my $real_dir = &util::filename_cat($assoc_filepath, $assoc_file), 184 236 my $assoc_dir = (defined $dir && $dir ne "") 185 237 ? &util::filename_cat($dir, $assoc_file) : $assoc_file; 186 238 $self->{'doc_obj'}->associate_file($real_dir, $assoc_dir, $mime_type); 239 #rint STDERR "According to me the real assoc_filepath is: $real_dir\n"; 187 240 } 188 241 $self->{'doc_obj'}->delete_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile"); -
trunk/gsdl/perllib/plugins/XMLPlug.pm
r12270 r12844 163 163 164 164 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 165 $self->{'base_dir'} = $base_dir; 165 166 $self->{'file'} = $file; 166 167 $self->{'filename'} = $filename;
Note:
See TracChangeset
for help on using the changeset viewer.