Changeset 12844


Ignore:
Timestamp:
2006-09-25T14:17:10+12:00 (18 years ago)
Author:
mdewsnip
Message:

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

Location:
trunk/gsdl
Files:
26 added
9 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/buildcol.pl

    r12821 r12844  
    11#!/usr/bin/perl -w
    22
    3 ###########################################################################
    4 #
    5 # buildcol.pl -- This program will build a particular collection
     3## @file buildcol.pl
     4# This program will build a particular collection.
    65# A component of the Greenstone digital library software
    76# from the New Zealand Digital Library Project at the
    87# University of Waikato, New Zealand.
    9 #
    10 # Copyright (C) 1999 New Zealand Digital Library Project
    118#
    129# This program is free software; you can redistribute it and/or modify
     
    2421# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
    2522#
    26 ###########################################################################
    27 
    28 # 11/04/03 Added usage datastructure - John Thompson
    29 
     23# @note 11/04/03 Added usage datastructure - John Thompson
     24#
     25# @author New Zealand Digital Library Project unless otherwise stated
     26# @copy 1999 New Zealand Digital Library Project
     27#
    3028package buildcol;
    3129
     
    194192    'type' => "flag",
    195193    'reqd' => "no",
    196     'hiddengli' => "yes" } ];
     194    'hiddengli' => "yes" },
     195      { 'name' => "incremental",
     196    'desc' => "{buildcol.incremental}",
     197    'type' => "flag",
     198    'reqd' => "no",
     199        'modegli' => "3" } ];
    197200
    198201my $options = { 'name' => "buildcol.pl",
     
    206209my $out;
    207210
    208 sub gsprintf
     211## @method gsprintf()
     212#  Print a string to the screen after looking it up from a locale dependant
     213#  strings file. This function is losely based on the idea of resource
     214#  bundles as used in Java.
     215#
     216#  @param  $error The STDERR stream.
     217#  @param  $text The string containing GS keys that should be replaced with
     218#                their locale dependant equivilents.
     219#  @param  $out The output stream.
     220#  @return The locale-based string to output.
     221#
     222sub gsprintf()
    209223{
    210224    return &gsprintf::gsprintf(@_);
    211225}
    212 
    213 
     226## gsprintf() ##
    214227
    215228&main();
    216229
     230## @method main()
     231#
     232#  [Parses up and validates the arguments to the build process before creating
     233#  the appropriate build process to do the actual work - John]
     234#
     235#  @note Added true incremental support - John Thompson, DL Consulting Ltd.
     236#  @note There were several bugs regarding using directories other than
     237#        "import" or "archives" during import and build quashed. - John
     238#        Thompson, DL Consulting Ltd.
     239#
     240#  @param  $incremental If true indicates this build should not regenerate all
     241#                       the index and metadata files, and should instead just
     242#                       append the information found in the archives directory
     243#                       to the existing files. If this requires some complex
     244#                       work so as to correctly insert into a classifier so be
     245#                       it. Of course none of this is done here - instead the
     246#                       incremental argument is passed to the document
     247#                       processor.
     248#
    217249sub main
    218250{
     
    222254    $create_images, $collectdir, $build, $type, $textindex,
    223255    $no_strip_html, $no_text, $faillog, $gli, $index, $language,
    224     $sections_index_document_metadata, $maxnumeric);
     256    $sections_index_document_metadata, $maxnumeric, $incremental);
    225257
    226258    my $xml = 0;
     
    296328
    297329    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
     330    # Don't know why this didn't already happen, but now collection specific
     331    # classify and plugins directory also added to include path
     332    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/classify");
     333    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/plugins");
    298334
    299335    # read the configuration file
     
    398434    $gli = 0 unless defined $gli;
    399435
     436    # New argument to track whether build is incremental
     437    $incremental = 0 unless defined $incremental;
     438
    400439    print STDERR "<Build>\n" if $gli;
    401440
     
    421460    # were supplied, turn all \ into / and remove trailing /
    422461
    423     my ($realarchivedir, $realbuilddir); 
    424     $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
     462    my ($realarchivedir, $realbuilddir);
     463    # Modified so that the archivedir, if provided as an argument, is made
     464    # absolute if it isn't already
     465    if ($archivedir eq "")
     466      {
     467        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
     468      }
     469    else
     470      {
     471        $archivedir = &make_absolute($ENV{'GSDLCOLLECTDIR'}, $archivedir);
     472      }
     473    # End Mod
    425474    $archivedir =~ s/[\\\/]+/\//g;
    426475    $archivedir =~ s/\/$//;
     
    476525     "\$realarchivedir, \$realbuilddir, \$verbosity, " .
    477526     "\$maxdocs, \$debug, \$keepold, \$remove_empty_classifications, " .
    478      "\$out, \$no_text, \$faillog, \$gli)");
     527     "\$out, \$no_text, \$faillog, \$gli, \$incremental)");
    479528    die "$@" if $@;
    480529
     
    519568    print STDERR "</Build>\n" if $gli;
    520569}
    521 
     570## main() ##
     571
     572## @method create_images()
     573#
     574#  [Used to create default cover images... from what I'm not quite sure - John]
     575#
    522576sub create_images {
    523577    my ($collection_name) = @_;
     
    571625    close CFGFILE;
    572626}
     627## create_images() ##
     628
     629## @method make_absolute()
     630#
     631#  Ensure the given file path is absolute in respect to the given base path.
     632#
     633#  @param  $base_dir A string denoting the base path the given dir must be
     634#                    abolsute to.
     635#  @param  $dir The directory to be made absolute as a string. Note that the
     636#               dir may already be absolute, in which case it will remain
     637#               unchanged.
     638#  @return The now absolute form of the directory as a string.
     639#
     640#  @author John Thompson, DL Consulting Ltd.
     641#  @copy 2006 DL Consulting Ltd.
     642#
     643sub make_absolute()
     644  {
     645    my ($base_dir, $dir) = @_;
     646    $dir = $base_dir . "/$dir" if ($dir =~ m#^[^/]#);
     647    $dir =~ s|^/tmp_mnt||;
     648    1 while($dir =~ s|/[^/]*/\.\./|/|g);
     649    $dir =~ s|/[.][.]?/|/|g;
     650    $dir =~ tr|/|/|s;
     651    return $dir;
     652  }
     653## make_absolute() ##
  • trunk/gsdl/bin/script/lucene_passes.pl

    r12258 r12844  
    55# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
    66# A component of the Greenstone digital library software
    7 # from the New Zealand Digital Library Project at the 
     7# from the New Zealand Digital Library Project at the
    88# University of Waikato, New Zealand.
    99#
     
    3939use ghtml;
    4040
    41 
    4241sub open_java_lucene
    4342{
    44     my ($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity) = @_;
    45 
    46     my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
    47     my $classpath = &util::filename_cat($bin_java,"LuceneWrap.jar");
    48 
    49     my $java_lucene = "java -classpath \"$classpath\" org.nzdl.gsdl.LuceneWrap.GS2LuceneIndexer";
    50     my $cmd_options = "$create -verbosity $verbosity";
    51     my $java_cmd = "$java_lucene $cmd_options $doc_tag_level \"$full_builddir\" $indexdir";
    52 
    53     if (!open (PIPEOUT, "| $java_cmd")) {
    54     die "$PROGNAME - couldn't run $java_cmd\n";
    55     }
     43  my ($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity) = @_;
     44
     45  my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
     46  my $classpath = &util::filename_cat($bin_java,"LuceneWrap.jar");
     47
     48  my $java_lucene = "java -classpath \"$classpath\" org.nzdl.gsdl.LuceneWrap.GS2LuceneIndexer";
     49  my $cmd_options = "$create -verbosity $verbosity";
     50  my $java_cmd = "$java_lucene $cmd_options $doc_tag_level \"$full_builddir\" $indexdir";
     51
     52  open (PIPEOUT, "| $java_cmd") or die "$PROGNAME - couldn't run $java_cmd\n";
    5653}
    5754
    5855sub close_java_lucene
    5956{
    60     close(PIPEOUT);
     57  close(PIPEOUT);
    6158}
    6259
     
    6663    $dir_sep = &util::get_os_dirsep();
    6764
    68     my $full_output_filename
    69     = &util::filename_cat($full_textdir,$output_filename);
     65    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);
    7066    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    7167    &util::mk_all_dir($full_output_dir);
    72    
    73     open(DOCOUT,">$full_output_filename") 
     68
     69    open(DOCOUT,">$full_output_filename")
    7470    || die "Unable to open $full_output_filename";
    7571
     
    7874
    7975    my @secs =  ($doc_xml =~ m/<Sec\s+gs2:id="\d+"\s*>.*?<\/Sec>/sg);
    80 
    81 
    82 # Currently not used, but consult with DB before removing
    83 #    foreach my $sec (@secs) {
    84 #   my ($docnum,$sec_text) = ($sec =~ m/<Sec\s+gs2:id="(\d+)"\s*>(.*?)<\/Sec>/s);
    85 #   my $docnum_filename
    86 #       = &util::filename_cat($full_textdir,"$docnum.xml");
    87 
    88 #   
    89 #   open(SECOUT,">$docnum_filename")
    90 #       || die "Unable to open $docnum_filename";
    91 
    92 #   print SECOUT &ghtml::unescape_html($sec_text);
    93 #   close(SECOUT);
    94 #    }
    95 
    9676}
    9777
     
    10080    my ($full_textdir,$output_filename) = @_;
    10181
    102     my $full_output_filename 
     82    my $full_output_filename
    10383    = &util::filename_cat($full_textdir,$output_filename);
    10484
     
    10686}
    10787
     88# This appears to be the callback that gets the xml stream during the
     89# build process, so I need to intercept it here and call my XML RPC
     90# to insert into the Lucene database.
    10891sub monitor_xml_stream
    10992{
     
    150133}
    151134
     135
     136# /** This checks the arguments on the command line, filters the
     137#  *  unknown command line arguments and then calls the open_java_lucene
     138#  *  function to begin processing. Most of the arguments are passed on
     139#  *  the command line of the java wrapper.
     140#  *
     141#  *  Do not set -create and -remove at the same time, although -create is
     142#  *  required for -remove, -remove will set it it's self, if you set -create
     143#  *  after -remove the create will be ignored.
     144#  *
     145#  *  @version 2.0 Added support for removing documents from the index by John Rowe
     146#  *
     147#  *  @author John Rowe, DL Consulting
     148#  */
    152149sub main
    153150{
    154     my (@argv) = @_;
    155     my $argc = scalar(@argv);
    156 
    157     my $create = "";
    158     my $verbosity = 1;
    159 
    160     my @filtered_argv = ();
    161 
    162     my $i = 0;
    163     while ($i<$argc) {
    164     if ($argv[$i] =~ m/^-(.*)$/) {
    165 
    166         my $option = $1;
    167 
    168         # -create causes build to be incremental
    169         if ($option eq ("create")) {
    170         $create = "-create";
    171         }
    172 
    173         # -verbosity num
    174         elsif ($option eq "verbosity") {
    175         $i++;
    176         if ($i<$argc) {
    177             $verbosity = $argv[$i];
    178         }
    179         }
    180         else {
    181         print STDERR "Unrecognised minus option: -$option\n";
    182         }
    183     }
    184     else {
    185         push(@filtered_argv,$argv[$i]);
    186     }
    187     $i++;
     151  my (@argv) = @_;
     152  my $argc = scalar(@argv);
     153
     154  my $create = "";
     155  my $verbosity = 1;
     156
     157  my @filtered_argv = ();
     158
     159  my $i = 0;
     160  while ($i<$argc) {
     161    if ($argv[$i] =~ m/^\-(.*)$/) {
     162
     163      my $option = $1;
     164
     165      # -create causes build to be incremental
     166      if ($option eq ("create")) {
     167        print STDERR "\n\n-create set\n";
     168        $create = "-create";
     169      }
     170      # In a blinding flash of unintuitiveness -remove causes
     171      # -create to be set (we don't want to remove the old indexes)
     172      elsif($option eq "remove")
     173      {
     174        # Look at the next arg for the oid and if that doesn't exist then
     175        $i++;
     176        if(!defined $argv[$i])
     177        {
     178          print STDERR "Remove was specified but the OID was not specified";
     179          die "\n\nCannot continue";
     180        }
     181        $removeoid = $argv[$i];
     182        print STDERR "\n\nWe're removing the document with id: '$removeoid'\n";
     183
     184        # Now, to make sure this gets through to the Java executable
     185        $create = "-create -remove '$removeoid'";
     186      }
     187
     188      # -verbosity num
     189      elsif ($option eq "verbosity") {
     190        $i++;
     191        if ($i<$argc) {
     192          $verbosity = $argv[$i];
     193        }
     194      }
     195      else {
     196        print STDERR "Unrecognised minus option: -$option\n";
     197      }
    188198    }
    189 
    190     my $filtered_argc = scalar(@filtered_argv);
    191 
    192     if ($filtered_argc < 4) {
    193     print STDERR "Usage: $PROGNAME [-create|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
    194     exit 1;
     199    else {
     200        push(@filtered_argv,$argv[$i]);
    195201    }
    196 
    197     my $mode = $filtered_argv[0];
    198     my $doc_tag_level = $filtered_argv[1];
    199     my $full_builddir = $filtered_argv[2];
    200     my $indexdir      = $filtered_argv[3];
     202    $i++;
     203  }
     204
     205  my $filtered_argc = scalar(@filtered_argv);
     206
     207  if ($filtered_argc < 4) {
     208    print STDERR "Usage: $PROGNAME [-create|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
     209    exit 1;
     210  }
     211
     212  my $mode = $filtered_argv[0];
     213  my $doc_tag_level = $filtered_argv[1];
     214  my $full_builddir = $filtered_argv[2];
     215  my $indexdir      = $filtered_argv[3];
    201216###    print STDERR "**** ARGS = ", join(" ", @argv), "\n";
    202217
    203     my $full_textdir = &util::filename_cat($full_builddir,"text");
    204 
    205     if ($mode eq "index") {
    206     # don't need the lucene stuff if we are just storing the docs
    207     open_java_lucene($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity);
    208     }
    209     monitor_xml_stream($mode, $full_textdir);
    210     if ($mode eq "index") {
    211     close_java_lucene();
    212     }
     218  my $full_textdir = &util::filename_cat($full_builddir,"text");
     219
     220  if ($mode eq "index") {
     221# don't need the lucene stuff if we are just storing the docs
     222    open_java_lucene($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity);
     223  }
     224  print STDERR "Monitoring for input!\n";
     225  monitor_xml_stream($mode, $full_textdir);
     226  if ($mode eq "index") {
     227    close_java_lucene();
     228  }
    213229}
    214230
  • trunk/gsdl/perllib/basebuilder.pm

    r12340 r12844  
    287287    my $outhandle = $self->{'outhandle'};
    288288
     289    print STDERR "BuildDir: $self->{'build_dir'}\n";
     290
    289291    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    290292    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
     
    361363                     $self->{'gli'});
    362364
     365    # Output classifier reverse lookup, used in incremental deletion
     366    #&classify::print_reverse_lookup($handle);
    363367
    364368    #output doclist
     
    401405    # store the number of documents and number of bytes
    402406    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
     407    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    403408    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();
    404 
    405409   
    406410    # store the mapping between the index names and the directory names
  • trunk/gsdl/perllib/basebuildproc.pm

    r11994 r12844  
    4040}
    4141
    42 sub new {
    43     my ($class, $collection, $source_dir, $build_dir, $keepold,
    44     $verbosity, $outhandle) = @_;
     42sub new()
     43  {
     44    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
    4545    my $self = new docproc ();
    4646
     
    7171    # For incremental building need to seed num_docs etc from values
    7272    # stored in build.cfg (if present)
    73 
     73      print STDERR "Keepold!\n";
    7474    $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
    75 
     75      print STDERR "Build cfg: $buildconfigfile\n";
    7676    if (-e $buildconfigfile) {
    7777        $found_num_data = 1;
     
    8181        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
    8282                           "index", "build.cfg");
     83            print STDERR "Index cfg: $buildconfigfile\n";
    8384        if (-e $buildconfigfile) {
    8485        $found_num_data = 1;
     
    8687    }
    8788
    88     }
    89 
    90     if ($found_num_data) {
     89    }
     90    #else
     91    #  {
     92    #    print STDERR "Removeold!\n";
     93    #  }
     94
     95    if ($found_num_data)
     96      {
     97        #print STDERR "Found_Num_Data!\n";
    9198    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
    92    
    9399    $self->{'starting_num_docs'}     = $buildcfg->{'numdocs'};
     100        #print STDERR "- num_docs:     $self->{'starting_num_docs'}\n";
    94101    $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
     102        #print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
    95103    $self->{'starting_num_bytes'}    = $buildcfg->{'numbytes'};
    96     }
    97     else {
    98     $self->{'starting_num_docs'}     = 0;
     104        #print STDERR "- num_bytes:    $self->{'starting_num_bytes'}\n";
     105    }
     106    else
     107      {
     108        #print STDERR "NOT Found_Num_Data!\n";
     109        $self->{'starting_num_docs'}     = 0;
    99110    $self->{'starting_num_sections'} = 0;
    100111    $self->{'starting_num_bytes'}    = 0;
    101     }
     112      }
    102113
    103114    $self->{'output_handle'} = "STDOUT";
     
    428439        # output the matching document number
    429440        print $handle "<docnum>$self->{'num_docs'}\n";
     441           
    430442    } else {
    431443        # output a list of children
     
    612624}
    613625
    614 sub assoc_files {
     626sub assoc_files() {
    615627    my $self = shift (@_);
    616628    my ($doc_obj, $archivedir) = @_;
     
    618630   
    619631    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
     632      #rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
    620633    # if assoc file starts with a slash, we put it relative to the assoc
    621634    # dir, otherwise it is relative to the HASH... directory
    622635    if ($assoc_file->[1] =~ m@^[/\\]@) {
    623         $afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
     636        $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
    624637    } else {
    625638        $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
  • trunk/gsdl/perllib/classify.pm

    r12559 r12844  
    4343
    4444$next_classify_num = 1;
     45$oid_to_clids = {};
     46
    4547sub load_classifier_for_info {
    4648    my ($classifier) = shift @_;
     
    302304    $tempinfo->{'classifyOID'} = "CL$next_classify_num" unless defined($tempinfo->{'classifyOID'});
    303305    $next_classify_num++;
     306
     307        print STDERR "*** outputting information for classifier: $tempinfo->{'classifyOID'}\n";
     308
    304309    push (@{$classifyinfo->{'contains'}}, $tempinfo);
    305310    }
     
    357362        $contains_text .= $tempinfo->{'classifyOID'};
    358363        }
     364
     365            # Extra code for incremental building.
     366            # We need to store a listing of the classifiers each DOI is in
     367            my $clids = [];
     368            #rint STDERR "==1. Recording reverse lookup for " . $tempinfo->{'classifyOID'} . "==\n";
     369            if(defined($oid_to_clids->{$tempinfo->{'classifyOID'}})) {
     370              #rint STDERR "Found existing array!\n";
     371              $clids = $oid_to_clids->{$tempinfo->{'classifyOID'}};
     372            }
     373            #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
     374            push(@{$clids}, $OID);
     375            $oid_to_clids->{$tempinfo->{'classifyOID'}} = $clids;
     376            #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
     377           
    359378        &print_classify_info ($handle, $tempinfo, $tempinfo->{'classifyOID'},
    360379                  $remove_empty_classifications);
    361380    } elsif (defined ($tempinfo->{'OID'})) {
    362381        $contains_text .= $tempinfo->{'OID'};
    363         $mdoffset_text .= $tempinfo->{'offset'}
    364         if (defined ($tempinfo->{'offset'}))
    365         # note: we don't want to print the contents of the books
     382        $mdoffset_text .= $tempinfo->{'offset'} if (defined ($tempinfo->{'offset'}));
     383
     384           
     385              # note: we don't want to print the contents of the books
     386              # Extra code for incremental building.
     387              # We need to store a listing of the classifiers each DOI is in
     388              my $clids = [];
     389              #rint STDERR "==2. Recording reverse lookup for " . $tempinfo->{'OID'} . "==\n";
     390              if(defined($oid_to_clids->{$tempinfo->{'OID'}})) {
     391                #rint STDERR "Found existing array!\n";
     392                $clids = $oid_to_clids->{$tempinfo->{'OID'}};
     393              }
     394              #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
     395              push(@{$clids}, $OID);
     396              $oid_to_clids->{$tempinfo->{'OID'}} = $clids;
     397              #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
     398
     399
    366400        } else {
    367401       
     
    373407            $contains_text .= "\".$next_subOID";
    374408        }
     409
     410                # Extra code for incremental building.
     411                # We need to store a listing of the classifiers each DOI is in
     412                my $clids = [];
     413                #rint STDERR "==3. Recording reverse lookup for $OID.$next_subOID==\n";
     414                if(defined($oid_to_clids->{$OID . "." . $next_subOID})) {
     415                  #rint STDERR "Found existing array!\n";
     416                  $clids = $oid_to_clids->{$OID . "." . $next_subOID};
     417                }
     418                #rint STDERR "Appended $OID to \"" . join(";", @{$clids}) . "\"\n";
     419                push(@{$clids}, $OID);
     420                $oid_to_clids->{$OID . "." . $next_subOID} = $clids;
     421                #rint STDERR "Result: \"" . join(";", @{$clids}) . "\"\n";
    375422       
    376423        &print_classify_info ($handle, $tempinfo, "$OID.$next_subOID",
  • trunk/gsdl/perllib/lucenebuilder.pm

    r11175 r12844  
    33# lucenebuilder.pm -- perl wrapper for building index with Lucene
    44# A component of the Greenstone digital library software
    5 # from the New Zealand Digital Library Project at the 
     5# from the New Zealand Digital Library Project at the
    66# University of Waikato, New Zealand.
    77#
     
    2424###########################################################################
    2525
     26###########################################################################
     27# /*
     28#  *  @version 1.0 ?
     29#  *  @version 2.0 Incremental building assistance added, including
     30#  *               remove_document_from_database which implements the granddad's
     31#  *               empty function to call the lucene_passes.pl and full_lucene_passes_exe
     32#  *               so there is one place in the code that works out where the
     33#  *               perl script is. John Rowe
     34#  *
     35#  *  @author John Rowe, DL Consulting Ltd.
     36#  */
     37###########################################################################
     38
    2639package lucenebuilder;
    2740
    2841# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
    2942
    30 use mgppbuilder;
     43use mgppbuilder;
     44
     45use IncrementalBuildUtils;
    3146
    3247sub BEGIN {
     
    3449}
    3550
    36 
     51# /**
     52#  *  @author  John Thompson, DL Consulting Ltd.
     53#  */
    3754sub new {
    3855    my $class = shift(@_);
    39     my ($collection, $source_dir, $build_dir, $verbosity,
    40     $maxdocs, $debug, $keepold, $allclassifications,
    41     $outhandle, $no_text, $gli) = @_;
     56    my ($collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug, $keepold, $allclassifications, $outhandle, $no_text, $faillog, $gli, $incremental) = @_;
    4257
    4358    my $self = new mgppbuilder (@_);
     
    4661    $self->{'buildtype'} = "lucene";
    4762
     63    # Do we need to put exe on the end?
     64    my $exe = &util::get_os_exe ();
     65    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
     66
     67    # So where is lucene_passes.pl anyway?
     68    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");
     69
     70    # So tack perl on the beginning to ensure execution
     71    $self->{'full_lucene_passes'} = "$lucene_passes_script";
     72    if ($exe eq ".exe")
     73    {
     74      $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
     75    }
     76    else
     77    {
     78      $self->{'full_lucene_passes_exe'} = "perl -S \"$lucene_passes_script\"";
     79    }
     80
     81    # We must also record whether we have been asked to do just an incremental
     82    # build (which makes no difference to the Lucene indexing bit, just the
     83    # building of the classifiers in the GDBM).
     84    $self->{'incremental'} = $incremental;
     85
    4886    return $self;
    4987}
     88# /** new() **/
    5089
    5190sub default_buildproc {
     
    5695
    5796# this writes a nice version of the text docs
    58 sub compress_text {
    59 
     97sub compress_text
     98  {
    6099    my $self = shift (@_);
    61    
    62100    # we don't do anything if we don't want compressed text
    63101    return if $self->{'no_text'};
     
    72110
    73111    my $osextra = "";
    74     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    75     $text_dir =~ s@/@\\@g;
    76     } else {
    77     if ($outhandle ne "STDERR") {
    78         # so lucene_passes doesn't print to stderr if we redirect output
    79         $osextra .= " 2>/dev/null";
    80     }
    81     }
    82    
     112    if ($ENV{'GSDLOS'} =~ /^windows$/i)
     113      {
     114    $text_dir =~ s@/@\\@g;
     115      }
     116    else
     117      {
     118    if ($outhandle ne "STDERR")
     119          {
     120        # so lucene_passes doesn't print to stderr if we redirect output
     121        $osextra .= " 2>/dev/null";
     122          }
     123      }
    83124
    84125    # get any os specific stuff
    85126    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
    86127
    87     my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
    88     my $full_lucene_passes_exe = "\"$lucene_passes_exe\"";
    89     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    90     $full_lucene_passes_exe = "perl.exe -S \"$lucene_passes_exe\"";
    91     }
     128    # Find the perl script to call to run lucene
     129    my $full_lucene_passes = $self->{'full_lucene_passes'};
     130    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};
     131
    92132    my $lucene_passes_sections = "Doc";
    93133
    94134    my ($handle);
    95135
    96     if ($self->{'debug'}) {
    97     $handle = STDOUT;
    98     } else {
    99     if (!-e "$lucene_passes_exe" ||
    100         !open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra")) {
    101         print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
    102         die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
    103     }
    104     $handle = lucenebuilder::PIPEOUT;
    105     }
     136    if ($self->{'debug'})
     137      {
     138    $handle = STDOUT;
     139      }
     140    else
     141      {
     142        print STDERR "Full Path:     $full_lucene_passes\n";
     143        print STDERR "Executable:    $full_lucene_passes_exe\n";
     144        print STDERR "Sections:      $lucene_passes_sections\n";
     145        print STDERR "Build Dir:     $build_dir\n";
     146        print STDERR "Cmd:           $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra\n";
     147    if (!-e "$full_lucene_passes" ||
     148        !open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra"))
     149          {
     150        print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
     151        die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
     152          }
     153    $handle = lucenebuilder::PIPEOUT;
     154      }
    106155    my $levels = $self->{'levels'};
    107156    my $gdbm_level = "document";
    108     if ($levels->{'section'}) {
    109     $gdbm_level = "section";
    110     }
     157    if ($levels->{'section'})
     158      {
     159    $gdbm_level = "section";
     160      }
    111161
    112162    undef $levels->{'paragraph'}; # get rid of para if we had it.
     
    117167    $self->{'buildproc'}->set_indexing_text (0);
    118168    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    119     $self->{'buildproc'}->set_levels ($levels);                       
    120     $self->{'buildproc'}->set_gdbm_level ($gdbm_level);                       
     169    $self->{'buildproc'}->set_levels ($levels);
     170    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);
    121171    $self->{'buildproc'}->reset();
    122     &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'}, 
    123            $self->{'buildproc'}, $self->{'maxdocs'});   
    124     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 
    125            "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
     172    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
     173           $self->{'buildproc'}, $self->{'maxdocs'});
     174    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
     175           "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    126176    &plugin::end($self->{'pluginfo'});
    127177    close ($handle) unless $self->{'debug'};
     
    130180
    131181    print STDERR "</Stage>\n" if $self->{'gli'};
    132 
    133 }
     182  }
    134183
    135184sub build_indexes {
     
    140189    my $indexes = [];
    141190    if (defined $indexname && $indexname =~ /\w/) {
    142     push @$indexes, $indexname;
     191    push @$indexes, $indexname;
    143192    } else {
    144     $indexes = $self->{'collect_cfg'}->{'indexes'};
    145     }
    146 
    147     # create the mapping between the index descriptions 
     193    $indexes = $self->{'collect_cfg'}->{'indexes'};
     194    }
     195
     196    # create the mapping between the index descriptions
    148197    # and their directory names (includes subcolls and langs)
    149198    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);
     
    151200    # build each of the indexes
    152201    foreach $index (@$indexes) {
    153     if ($self->want_built($index)) {
    154 
    155         my $idx = $self->{'index_mapping'}->{$index};
    156         foreach my $level (keys %{$self->{'levels'}}) {
    157         next if $level =~ /paragraph/; # we don't do para indexing
    158         my ($pindex) = $level =~ /^(.)/;
    159         # should probably check that new name with level
    160         # is unique ... but currently (with doc sec and para)
    161         # each has unique first letter.
    162         $self->{'index_mapping'}->{$index} = $pindex.$idx;
    163 
    164         my $llevel = $mgppbuilder::level_map{$level};
    165         print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
    166             "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
    167         print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};
    168 
    169         $self->build_index($index,$llevel);
    170         }
    171         $self->{'index_mapping'}->{$index} = $idx;
    172 
    173     } else {
    174         print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
    175     }
     202    if ($self->want_built($index)) {
     203
     204        my $idx = $self->{'index_mapping'}->{$index};
     205        foreach my $level (keys %{$self->{'levels'}}) {
     206        next if $level =~ /paragraph/; # we don't do para indexing
     207        my ($pindex) = $level =~ /^(.)/;
     208        # should probably check that new name with level
     209        # is unique ... but currently (with doc sec and para)
     210        # each has unique first letter.
     211        $self->{'index_mapping'}->{$index} = $pindex.$idx;
     212
     213        my $llevel = $mgppbuilder::level_map{$level};
     214        print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
     215            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
     216        print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};
     217
     218        $self->build_index($index,$llevel);
     219        }
     220        $self->{'index_mapping'}->{$index} = $idx;
     221
     222    } else {
     223        print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
     224    }
    176225    }
    177226
     
    179228    $self->make_final_field_list();
    180229}
     230
     231# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
     232#  *  -remove and the document id on the command line.
     233#  *
     234#  *  @param oid is the document identifier to be removed.
     235#  *
     236#  *  @author John Rowe, DL Consulting Ltd.
     237#  */
     238sub remove_document_from_database
     239  {
     240    my ($self, $oid) = @_;
     241    # Find the perl script to call to run lucene
     242    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};
     243    # Call lucene_passes.pl with -remove and the document ID on the command line
     244    `$full_lucene_passes_exe -remove "$oid"`;
     245  }
     246# /** remove_document_from_database **/
    181247
    182248sub build_index {
     
    194260    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
    195261
    196     my $exe = &util::get_os_exe ();
    197     my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
    198     my $full_lucene_passes_exe = "\"$lucene_passes_exe\"";
    199     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    200     $full_lucene_passes_exe = "perl.exe -S \"$lucene_passes_exe\"";
    201     }
     262    # Find the perl script to call to run lucene
     263    my $full_lucene_passes = $self->{'full_lucene_passes'};
     264    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};
    202265
    203266    # define the section names for lucenepasses
     
    209272    my $osextra = "";
    210273    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    211     $build_dir =~ s@/@\\@g;
     274    $build_dir =~ s@/@\\@g;
    212275    } else {
    213     if ($outhandle ne "STDERR") {
    214         # so lucene_passes doesn't print to stderr if we redirect output
    215         $osextra .= " 2>/dev/null";
    216     }
    217     }
    218  
     276    if ($outhandle ne "STDERR") {
     277        # so lucene_passes doesn't print to stderr if we redirect output
     278        $osextra .= " 2>/dev/null";
     279    }
     280    }
     281
    219282    # get the index expression if this index belongs
    220283    # to a subcollection
     
    222285    my $langarr = [];
    223286
    224     # there may be subcollection info, and language info. 
     287    # there may be subcollection info, and language info.
    225288    my ($fields, $subcollection, $language) = split (":", $index);
    226289    my @subcollections = ();
     
    228291
    229292    foreach $subcollection (@subcollections) {
    230     if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
    231         push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
    232     }
    233     }
    234    
     293    if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
     294        push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
     295    }
     296    }
     297
    235298    # add expressions for languages if this index belongs to
    236     # a language subcollection - only put languages expressions for the 
     299    # a language subcollection - only put languages expressions for the
    237300    # ones we want in the index
    238    
    239301    my @languages = ();
    240302    my $language_metadata = "Language";
    241303    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
    242     $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
     304    $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    243305    }
    244306    @languages = split /,/, $language if (defined $language);
    245307    foreach my $language (@languages) {
    246     my $not=0;
    247     if ($language =~ s/^\!//) {
    248         $not = 1;
    249     }
    250     if($not) {
    251         push (@$langarr, "!$language");
    252     } else {
    253         push (@$langarr, "$language");
    254     }
     308    my $not=0;
     309    if ($language =~ s/^\!//) {
     310        $not = 1;
     311    }
     312    if($not) {
     313        push (@$langarr, "!$language");
     314    } else {
     315        push (@$langarr, "$language");
     316    }
    255317    }
    256318
     
    261323
    262324    if ($self->{'debug'}) {
    263     $handle = STDOUT;
     325    $handle = STDOUT;
    264326    } else {
    265     if (!-e "$lucene_passes_exe" ||
    266         !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra")) {
    267         print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
    268         die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
    269     }
    270     $handle = lucenebuilder::PIPEOUT;
    271     }
    272    
     327      print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra\n";
     328    if (!-e "$full_lucene_passes" ||
     329        !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra")) {
     330        print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
     331        die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
     332    }
     333    $handle = lucenebuilder::PIPEOUT;
     334    }
     335
    273336    my $store_levels = $self->{'levels'};
    274337    my $gdbm_level = "document";
    275338    if ($store_levels->{'section'}) {
    276     $gdbm_level = "section";
    277     }
    278    
     339    $gdbm_level = "section";
     340    }
     341
    279342    my $dom_level = "";
    280343    foreach my $key (keys %$store_levels) {
    281     if ($mgppbuilder::level_map{$key} eq $llevel) {
    282         $dom_level = $key;
    283     }
     344    if ($mgppbuilder::level_map{$key} eq $llevel) {
     345        $dom_level = $key;
     346    }
    284347    }
    285348    if ($dom_level eq "") {
    286     print STDERR "Warning: unrecognized tag level $llevel\n";
    287     $dom_level = "document";
     349    print STDERR "Warning: unrecognized tag level $llevel\n";
     350    $dom_level = "document";
    288351    }
    289352
     
    297360    $self->{'buildproc'}->set_indexing_text (1);
    298361    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    299     $self->{'buildproc'}->set_levels ($local_levels);                       
     362    $self->{'buildproc'}->set_levels ($local_levels);
    300363    $self->{'buildproc'}->set_gdbm_level($gdbm_level);
    301364    $self->{'buildproc'}->reset();
    302     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 
    303            "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
     365    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
     366           "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    304367    close ($handle) unless $self->{'debug'};
    305368
    306369    $self->print_stats();
    307370
    308     $self->{'buildproc'}->set_levels ($store_levels);                       
     371    $self->{'buildproc'}->set_levels ($store_levels);
    309372    print STDERR "</Stage>\n" if $self->{'gli'};
    310 }   
     373}
     374
     375# /** A modified version of the basebuilder.pm's function that generates the
     376#  *  information database (GDBM) from the GA documents. We need to change this
     377#  *  so that if we've been asked to do an incremental build we only add
     378#  *  metadata to autohierarchy classifiers via the IncrementalBuildUtils
     379#  *  module. All other classifiers and metadata will be ignored.
     380#  */
     381sub make_infodatabase
     382  {
     383    my $self = shift (@_);
     384    my $outhandle = $self->{'outhandle'};
     385
     386    my $dbext = ".bdb";
     387    $dbext = ".ldb" if &util::is_little_endian();
     388    my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $dbext);
     389
     390    # If we aren't doing an incremental addition, then we just call the super-
     391    # classes version
     392    # Note: Incremental addition can only occur if a text/<collection>.ldb
     393    #       already exists. If it doesn't, let the super classes function be
     394    #       called once to generate it.
     395    if (!$self->{'incremental'} || !(-e $infodb_file))
     396      {
     397        # basebuilder::make_infodatabase(@_);
     398        # Note: this doesn't work as the direct reference means all the $self
     399        #       data is lost.
     400        $self->basebuilder::make_infodatabase(@_);
     401        return;
     402      }
     403
     404    # Carry on with an incremental addition
     405    print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);
     406    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
     407
     408    # 1. Init all the classifiers
     409    &classify::init_classifiers ($self->{'classifiers'});
     410    # 2. Init the buildproc settings.
     411    #    Note: we still need this to process any associated files - but we
     412    #    don't expect to pipe anything to txt2db so we can do away with the
     413    #    complex output handle.
     414    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
     415    &util::mk_all_dir ($assocdir);
     416    $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important
     417    $self->{'buildproc'}->set_assocdir ($assocdir);
     418    # 3. Read in all the metadata from the files in the archives directory using
     419    #    the GAPlug and using ourselves as the document processor!
     420    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
     421
     422    print STDERR "</Stage>\n" if $self->{'gli'};
     423}
    311424
    3124251;
  • trunk/gsdl/perllib/lucenebuildproc.pm

    r12426 r12844  
    3131# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
    3232
    33 use mgppbuildproc; 
     33use mgppbuildproc;
    3434use ghtml;
    3535use strict;
     
    3737
    3838
     39use IncrementalBuildUtils;
     40
    3941sub BEGIN {
    4042    @lucenebuildproc::ISA = ('mgppbuildproc');
     
    4547    my $class = shift @_;
    4648    my $self = new mgppbuildproc (@_);
     49
     50    $self->{'numincdocs'} = 0;
    4751
    4852    return bless $self, $class;
     
    8690
    8791    my $docid="";
    88     if ($ldoc_level) {
    89     if ($self->{'gdbm_level'} eq 'document') {
    90         my $doc_sec_num = $self->{'num_docs'}; 
     92    if ($ldoc_level)
     93      {
     94    if ($self->{'gdbm_level'} eq 'document')
     95          {
     96        my $doc_sec_num = $self->{'num_docs'};
    9197        $docid = "gs2:id=\"$doc_sec_num\"";
    92     } else  {
     98          }
     99        else
     100          {
    93101        # default is section level
    94         my $doc_sec_num = $self->{'num_sections'}+1;   
     102        my $doc_sec_num = $self->{'num_sections'} + 1;
    95103        $docid = "gs2:id=\"$doc_sec_num\"";
    96     }
    97     }
     104          }
     105      }
    98106    my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    99107    my $documentendtag = "\n</$doc_level>\n";
    100    
    101     my ($sectiontag) = "";   
    102     if ($lsec_level) {
     108
     109    my ($sectiontag) = "";
     110    if ($lsec_level)
     111      {
    103112    $sectiontag = $mgppbuildproc::level_map{'section'};
    104     }
     113      }
    105114    my ($parastarttag) = "";
    106115    my ($paraendtag) = "";
    107     if ($self->{'levels'}->{'paragraph'}) {
    108     if ($self->{'strip_html'}) {
     116    if ($self->{'levels'}->{'paragraph'})
     117      {
     118    if ($self->{'strip_html'})
     119          {
    109120        $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
    110121        $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
    111     } else {
     122          }
     123        else
     124          {
    112125        print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
    113     }
    114     }
    115    
     126          }
     127      }
     128
    116129    my $doc_section = 0; # just for this document
    117130
     
    120133    # get the text for this document
    121134    my $section = $doc_obj->get_top_section();
    122     while (defined $section) {
     135    while (defined $section)
     136      {
    123137    # update a few statistics
    124138    $doc_section++;
    125     $self->{'num_sections'} += 1;
    126  
    127     if ($sectiontag ne "") {
    128         my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";       
     139    $self->{'num_sections'}++;
     140
     141    if ($sectiontag ne "")
     142          {
     143        my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
    129144        $text .= "\n<$sectiontag $secid >\n";
    130     }
     145          }
    131146
    132147    # if we are doing subcollections, then some docs shouldn't be indexed.
    133     # but we need to put the section tag placeholders in there so the 
     148    # but we need to put the section tag placeholders in there so the
    134149    # sections match up with gdbm db
    135150    my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
    136151    if (!$indexed_doc || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
    137152        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
    138         $section = $doc_obj->get_next_section($section);
     153            $section = $doc_obj->get_next_section($section);
    139154        next;
    140     }
    141    
     155          }
     156
    142157    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    143     foreach my $field (split (/;/, $fields)) {
     158    foreach my $field (split (/;/, $fields))
     159          {
    144160        # only deal with this field if it doesn't start with top or
    145161        # this is the first section
     
    147163        next if (($real_field =~ s/^top//) && ($doc_section != 1));
    148164
    149         my $new_text = "";
    150 
    151         # we get allfields by default - do nothing
    152         if ($real_field eq "allfields") {
    153 
    154         }   
     165        my $new_text = "";
     166        my $tmp_text = "";
     167
     168        # If allfields is requested add all metadata fields and text as
     169            # belonging to the ZZ field
     170        if ($real_field eq "allfields") {
     171              # Text first - no html nor paragraph tags
     172              $new_text .= "$parastarttag<ZZ index=\"1\">\n";
     173              $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
     174              &ghtml::htmlsafe($tmp_text);
     175              $new_text .= "$tmp_text</ZZ>$paraendtag\n";
     176              # Then Metadata
     177              my $metadata = $doc_obj->get_all_metadata ($section);
     178              foreach my $pair (@$metadata) {
     179                my ($mfield, $mvalue) = (@$pair);
     180                &ghtml::htmlsafe($mvalue);
     181                # check fields here, maybe others dont want - change to use dontindex!!
     182                if ($mfield ne "Identifier"
     183                    && $mfield !~ /^gsdl/
     184                    && $mfield ne "classifytype"
     185                    && $mfield ne "assocfilepath"
     186                    && defined $mvalue && $mvalue ne "") {
     187                  $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
     188                }
     189                if (!defined $self->{'indexfields'}->{$mfield}) {
     190                  $self->{'indexfields'}->{$mfield} = 1;
     191                }
     192              }
     193        }
    155194        # metadata - output all metadata we know about except gsdl stuff
    156         elsif ($real_field eq "metadata") {
     195        elsif ($real_field eq "metadata" || $real_field eq "allfields") {
    157196        my $shortname = "";
    158197        my $metadata = $doc_obj->get_all_metadata ($section);
    159198        foreach my $pair (@$metadata) {
    160199            my ($mfield, $mvalue) = (@$pair);
     200                    &ghtml::htmlsafe($mvalue);
    161201            # check fields here, maybe others dont want - change to use dontindex!!
    162202            if ($mfield ne "Identifier"
     
    165205            && $mfield ne "assocfilepath"
    166206            && defined $mvalue && $mvalue ne "") {
    167            
     207
    168208            if (defined $self->{'indexfieldmap'}->{$mfield}) {
    169209                $shortname = $self->{'indexfieldmap'}->{$mfield};
     
    173213                $self->{'indexfieldmap'}->{$mfield} = $shortname;
    174214                $self->{'indexfieldmap'}->{$shortname} = 1;
    175             }     
     215            }
    176216            $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
    177217            if (!defined $self->{'indexfields'}->{$mfield}) {
    178218                $self->{'indexfields'}->{$mfield} = 1;
    179             }                   
     219            }
    180220            }
    181221        }
    182222        }
    183         else { 
     223        else {
    184224        #individual metadata and or text specified - could be a comma separated list
    185225        my $shortname="";
     
    192232            $self->{'indexfieldmap'}->{$shortname} = 1;
    193233        }
    194        
     234
    195235        my @metadata_list = ();
    196236        foreach my $submeta (split /,/, $real_field) {
     
    209249                $new_text .= "$section_text</$shortname>$paraendtag\n";
    210250            }
    211             else {
    212                             # leave html stuff in, but escape the tags, and don't add Paragraph tags - never retrieve paras at the moment
    213                 &ghtml::htmlsafe($section_text);
    214                 $new_text .= $section_text;
     251            else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
     252                $tmp_text .= $doc_obj->get_text ($section);
     253                &ghtml::htmlsafe($tmp_text);
     254                $new_text .= $tmp_text;
    215255            }
    216256            }
     
    229269        }
    230270        }
    231        
    232271        # filter the text
    233272        $self->filter_text ($field, $new_text);
    234273        $self->{'num_processed_bytes'} += length ($new_text);
     274
    235275        $text .= "$new_text";
    236276    } # foreach field
    237        
     277
    238278    $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
    239279
    240     $section = $doc_obj->get_next_section($section);
     280        $section = $doc_obj->get_next_section($section);
    241281    } #while defined section
    242     print $handle "$text\n$documentendtag"; 
     282    print $handle "$text\n$documentendtag";
    243283    #print STDOUT "$text\n$documentendtag";
    244284}
    245285
     286# /** We make this builder pretend to be a document processor so we can get
     287#  *  information back from the plugins.
     288#  *
     289#  *  @param  $self    A reference to this Lucene builder
     290#  *  @param  $doc_obj A reference to a document object representing what was
     291#  *                   parsed by the GAPlug
     292#  *  @param  $file    The name of the file parsed as a string
     293#  *
     294#  *  @author John Thompson, DL Consulting Ltd
     295#  */
     296sub process()
     297  {
     298    my $self = shift (@_);
     299    my ($doc_obj, $file) = @_;
     300
     301    # If this is called from any stage other than an incremental infodb we want
     302    # to pass through to the superclass of build
     303    if ($self->get_mode() eq "incinfodb")
     304      {
     305        print STDERR "*** Processing a document added using INCINFODB ***\n";
     306        my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
     307        $archivedir = "" unless defined $archivedir;
     308        $archivedir =~ s/\\/\//g;
     309        $archivedir =~ s/^\/+//;
     310        $archivedir =~ s/\/+$//;
     311
     312        # Number of files
     313        print STDERR "There are " . scalar($doc_obj->get_assoc_files()) . " associated documents...\n";
     314
     315        # resolve the final filenames of the files associated with this document
     316        $self->assoc_files ($doc_obj, $archivedir);
     317
     318        # is this a paged or a hierarchical document
     319        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
     320
     321        # Determine the actual docnum by checking if we've processed any
     322        # previous incrementally added documents. If so, carry on from there.
     323        # Otherwise we set the counter to be the same as the number of
     324        # sections encountered during the previous build
     325        if ($self->{'numincdocs'} == 0)
     326          {
     327            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
     328          }
     329
     330        my $section = $doc_obj->get_top_section ();
     331        print STDERR "+ top section: '$section'\n";
     332        my $doc_OID = $doc_obj->get_OID();
     333        my $url = "";
     334        while (defined $section)
     335          {
     336            print STDERR "+ processing section: '$section'\n";
     337            # Attach all the other metadata to this document
     338            # output the fact that this document is a document (unless doctype
     339            # has been set to something else from within a plugin
     340            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
     341            if (!defined $dtype || $dtype !~ /\w/)
     342              {
     343                $doc_obj->add_utf8_metadata($section, "doctype", $dtype);
     344              }
     345            # output whether this node contains text
     346            if ($doc_obj->get_text_length($section) > 0)
     347              {
     348                $doc_obj->add_utf8_metadata($section, "hastxt", 1);
     349              }
     350            else
     351              {
     352                $doc_obj->add_utf8_metadata($section, "hastxt", 0);
     353              }
     354
     355            # output archivedir if at top level
     356            if ($section eq $doc_obj->get_top_section())
     357              {
     358                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
     359        $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
     360              }
     361
     362            # output a list of children
     363            my $children = $doc_obj->get_children ($section);
     364            if (scalar(@$children) > 0)
     365              {
     366                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
     367                my @contains = ();
     368                foreach my $child (@$children)
     369                  {
     370                    if ($child =~ /^.*?\.(\d+)$/)
     371                      {
     372                        push (@contains, "\".$1");
     373                      }
     374                    else
     375                      {
     376                        push (@contains, "\".$child");
     377                      }
     378                  }
     379                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
     380              }
     381            #output the matching doc number
     382            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
     383            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
     384
     385            $self->{'numincdocs'}++;
     386            $section = $doc_obj->get_next_section($section);
     387            # if no sections wanted, only gdbm the docs
     388            last if ($self->{'gdbm_level'} eq "document");
     389          }
     390        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
     391        &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
     392      }
     393    else
     394      {
     395        $self->mgppbuildproc::process(@_);
     396      }
     397  }
     398# /** process() **/
     399
     400sub get_num_docs {
     401    my $self = shift (@_);
     402    #rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
     403    return $self->{'num_docs'};
     404}
     405
     406sub get_num_sections {
     407    my $self = shift (@_);
     408    #rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
     409    return $self->{'num_sections'};
     410}
     411
     412# num_bytes is the actual number of bytes in the collection
     413# this is normally the same as what's processed during text compression
     414sub get_num_bytes {
     415    my $self = shift (@_);
     416    #rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
     417    return $self->{'num_bytes'};
     418}
     419
    2464201;
    247421
  • trunk/gsdl/perllib/plugins/GAPlug.pm

    r12169 r12844  
    7070    $self->{'metadata_value'} = "";
    7171    $self->{'content'} = "";
     72
     73#    # Currently used to store information for previous values controls. In
     74#    # the next contract I'll move to using information directly from Lucene.
     75#    $self->{'sqlfh'} = 0;
    7276   
    7377    return bless $self, $class;
     
    128132    elsif ($element eq "Metadata") {
    129133    $self->{'doc_obj'}->add_utf8_metadata($self->{'section'}, $self->{'metadata_name'},$self->{'metadata_value'});
     134        # Ensure this value is added to the allvalues database in gseditor.
     135        # Note that the database constraints prevent multiple occurances of the
     136        # same key-value pair.
     137        # We write these out to a file, so they can all be commited in one
     138        # transaction
     139        #if (!$self->{'sqlfh'})
     140        #  {
     141        #    my $sql_file = $ENV{'GSDLHOME'} . "/collect/lld/tmp/gseditor.sql";
     142        #    # If the file doesn't already exist, open it and begin a transaction
     143        #    my $sql_fh;
     144        #    if (!-e $sql_file)
     145        #      {
     146        #        open($sql_fh, ">" . $sql_file);
     147        #        print $sql_fh "BEGIN TRANSACTION;\n";
     148        #      }
     149        #    else
     150        #      {
     151        #        open($sql_fh, ">>" . $sql_file);
     152        #      }
     153        #    print STDERR "Opened SQL log\n";
     154        #    $self->{'sqlfh'} = $sql_fh;
     155        #  }
     156
     157        #my $mvalue = $self->{'metadata_value'};
     158        #$mvalue =~ s/\'/\'\'/g;
     159        #$mvalue =~ s/_claimantsep_/ \& /g;
     160
     161        #my $fh = $self->{'sqlfh'};
     162        #if ($fh)
     163        #  {
     164        #    print $fh "INSERT INTO allvalues (mkey, mvalue) VALUES ('" . $self->{'metadata_name'} . "', '" . $mvalue . "');\n";
     165        #  }
     166
     167        # Clean Up
    130168    $self->{'metadata_name'} = "";
    131169    $self->{'metadata_value'} = "";
     
    135173    $self->{'content'} = "";
    136174    }
    137    
    138175    $self->{'element'} = "";
    139176}
     
    161198sub close_document {
    162199    my $self = shift(@_);
    163    
     200
    164201    # add the associated files
    165202    my $assoc_files =
     
    170207
    171208    my $assoc_filepath=shift (@$assoc_filepath_list);
    172     if (defined ($assoc_filepath)) {
    173     # make absolute rather than relative...
    174     $self->{'filename'} =~ m@^(.*[\\/]archives)@;
    175     $assoc_filepath = "$1/$assoc_filepath/";
    176     } else {
     209
     210    #rint STDERR "Filename is: " . $self->{'filename'} . "\n";
     211    #rint STDERR "Initially my assoc_filepath is: $assoc_filepath\n";
     212    #rint STDERR "Custom archive dir is: " . $self->{'base_dir'} . "\n";
     213    # Correct the assoc filepath if one is defined
     214    if (defined ($assoc_filepath))
     215      {
     216        # Check whether the assoc_filepath already includes the base dir
     217        if (index($assoc_filepath, $self->{'base_dir'}) == -1)
     218          {
     219            # And if not, append it so as to make this absolute
     220            $assoc_filepath = &util::filename_cat($self->{'base_dir'}, $assoc_filepath);
     221          }
     222      }
     223    else
     224      {
    177225    $assoc_filepath = $self->{'filename'};
    178226    $assoc_filepath =~ s/[^\\\/]*$//;
    179     }
     227      }
     228    #rint STDERR "Goned and made it absolute: $assoc_filepath\n";
    180229
    181230    foreach my $assoc_file_info (@$assoc_files) {
    182231    my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
     232        #rint STDERR "assoc_file: $assoc_file\n";
     233        #rint STDERR "mime_type: $mime_type\n";
     234        #rint STDERR "dir: $dir\n";
    183235    my $real_dir = &util::filename_cat($assoc_filepath, $assoc_file),
    184236    my $assoc_dir = (defined $dir && $dir ne "")
    185237        ? &util::filename_cat($dir, $assoc_file) : $assoc_file;
    186238    $self->{'doc_obj'}->associate_file($real_dir, $assoc_dir, $mime_type);
     239        #rint STDERR "According to me the real assoc_filepath is: $real_dir\n";
    187240    }
    188241    $self->{'doc_obj'}->delete_metadata($self->{'doc_obj'}->get_top_section(), "gsdlassocfile");
  • trunk/gsdl/perllib/plugins/XMLPlug.pm

    r12270 r12844  
    163163
    164164    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
     165    $self->{'base_dir'} = $base_dir;
    165166    $self->{'file'} = $file;
    166167    $self->{'filename'} = $filename;
Note: See TracChangeset for help on using the changeset viewer.