Changeset 12003


Ignore:
Timestamp:
2006-07-04T15:36:20+12:00 (18 years ago)
Author:
davidb
Message:

Scripts upgraded to perform more efficiently with incremental addition.

Location:
trunk/gsdl/bin/script
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/build

    r2892 r12003  
    155155    print STDOUT "   -optionfile file        Get options from file, useful on systems where\n";
    156156    print STDOUT "                           long command lines may cause problems\n";
     157    print STDOUT "   -indextype mg|mgpp|lucene \n";
     158    print STDERR "                           Specify the type of indexer used in this collection\n";
     159    print STDERR "                           If -append is used then -indextype is needed to \n";
     160    print STDERR "                           determine how to run buildcol.pl as well as update\n";
     161    print STDERR "                           'building' and 'index' according.\n";
    157162    print STDOUT "   -append                 Add new files to existing collection\n";
     163    print STDOUT "   -manifest               Use manifest.xml file to determine which files to process.\n";
    158164    print STDOUT "   -remove_archives        Remove archives directory after successfully\n";
    159165    print STDOUT "                           building the collection.\n";
     
    386392    my $import_cmd = "perl -S import.pl";
    387393    $import_cmd .= " -out \"$outfile.import\"" if $use_out;
    388     $import_cmd .= " -removeold" unless $append;
     394    if ($append) {
     395    $import_cmd .= " -keepold";
     396    } else {
     397    $import_cmd .= " -removeold";
     398    }
     399
     400    $import_cmd .= " -manifest manifest.xml" if ($manifest);
    389401    $import_cmd .= " -collectdir \"$collectdir\"" if $collectdir =~ /\w/;
    390402    $import_cmd .= " -statsfile \"$statsfile\"" if $statsfile =~ /\w/;
     
    415427
    416428    my $build_cmd = "perl -S buildcol.pl";
     429
     430    my $removeold = 1;
     431    if ($append) {
     432    if ($indextype eq "lucene") {
     433        $build_cmd .= " -keepold";
     434        $removeold = 0;
     435    }
     436    else {
     437        $build_cmd .= " -removeold";
     438    }
     439    }
     440    else {
     441    $build_cmd .= " -removeold";
     442    }
     443
    417444    $build_cmd .= " -out \"$outfile.build\"" if $use_out;
    418445    $build_cmd .= " -collectdir \"$collectdir\"" if $collectdir =~ /\w/;
     
    437464    }
    438465
    439     # replace old indexes with new ones
    440     if (&has_content ($indexdir)) {
    441     print $out "removing old indexes\n";
    442     &util::rm_r ($indexdir);
    443     }
    444     rmdir ($indexdir) if -d $indexdir;
    445     &File::Copy::move ($buildingdir, $indexdir);
     466    if ($removeold) {
     467    # replace old indexes with new ones
     468    if (&has_content ($indexdir)) {
     469        print $out "removing old indexes\n";
     470        &util::rm_r ($indexdir);
     471    }
     472    rmdir ($indexdir) if -d $indexdir;
     473    &File::Copy::move ($buildingdir, $indexdir);
     474    }
     475    else {
     476    # Do nothing.  Assume index is a symbolic link to building
     477    }
    446478
    447479    # remove the cached archives
     
    560592    if (!parsargv::parse($argref,
    561593             'optionfile/.*/', \$optionfile,
     594             'indextype/^(mg|mgpp|lucene)$/mg', \$indextype,
    562595             'append', \$append,
     596             'manifest', \$manifest,
    563597             'remove_archives', \$remove_archives,
    564598             'remove_import', \$remove_import,
  • trunk/gsdl/bin/script/import.pl

    r11746 r12003  
    4444use plugin;
    4545use docprint;
     46use manifest;
    4647use util;
    4748use scriptutil;
     
    5051use printusage;
    5152use parse2;
     53
     54
    5255
    5356use strict;
     
    9295    # parsearg left "" as default
    9396    #'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
     97    'deft' => "",
     98    'reqd' => "no",
     99        'hiddengli' => "yes" },
     100      { 'name' => "manifest",
     101    'desc' => "{import.manifest}",
     102    'type' => "string",
    94103    'deft' => "",
    95104    'reqd' => "no",
     
    219228
    220229sub main {
    221     my ($verbosity, $importdir, $archivedir, $keepold,
     230    my ($verbosity, $importdir, $archivedir, $manifest, $keepold,
    222231    $removeold, $saveas, $version,
    223232    $gzip, $groupsize, $OIDtype, $debug,
     
    331340    $archivedir = $collectcfg->{'archivedir'};
    332341    }
     342    if (defined $collectcfg->{'manifest'} && $manifest eq "") {
     343    $manifest = $collectcfg->{'manifest'};
     344    }
    333345
    334346    if (defined $collectcfg->{'gzip'} && !$gzip) {
     
    403415    $archivedir =~ s/\/$//;
    404416
     417    my $manifest_lookup = new manifest();
     418    if ($manifest ne "") { 
     419    my $manifest_filename = $manifest;
     420
     421    if ($manifest_filename !~ m/^[\\\/]/) {
     422        $manifest_filename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, $manifest_filename);
     423    }
     424
     425    $manifest =~ s/[\\\/]+/\//g;
     426    $manifest =~ s/\/$//;
     427
     428    $manifest_lookup->parse($manifest_filename);
     429    }
     430
     431
    405432    # load all the plugins
    406433    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillog, \@global_opts);
     
    445472    &plugin::begin($pluginfo, $importdir, $processor, $maxdocs, $gli);
    446473
    447     # process the import directory
    448     &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs, 0, $gli);
    449    
     474    if ($manifest eq "") {
     475    # process the import directory
     476    &plugin::read ($pluginfo, $importdir, "", {}, $processor, $maxdocs, 0, $gli);
     477    }
     478    else {
     479
     480    # process any new files
     481    foreach my $file (keys %{$manifest_lookup->{'index'}}) {
     482        &plugin::read ($pluginfo, $importdir, $file, {}, $processor, $maxdocs, 0, $gli);
     483    }
     484
     485    # record files marked for deletion in arcinfo
     486    foreach my $file (keys %{$manifest_lookup->{'delete'}}) {
     487        # consider finding it?
     488        # $archive_info->add_info($OID,$doc_xml_file,"D");
     489    }
     490    }
     491
    450492    &plugin::end($pluginfo, $processor);
    451493
  • trunk/gsdl/bin/script/lucene_passes.pl

    r10165 r12003  
    127127        } elsif ($mode eq "index") {
    128128        # notify lucene indexer
    129         # print STDERR $doc_xml;
     129
     130        # SAX parser seems to be sensitive to blank lines
     131        # => remove them
     132        $doc_xml =~ s/\n+/\n/g;
     133
     134#        print STDERR $doc_xml;
     135
    130136##      print PIPEOUT "$output_filename\n";
     137
    131138        print PIPEOUT "$doc_xml";
     139
     140
    132141        #save_xml_doc($full_textdir, "$output_filename.txt", $doc_xml);
    133142        }
Note: See TracChangeset for help on using the changeset viewer.