Changeset 1852


Ignore:
Timestamp:
2001-01-22T15:30:56+13:00 (23 years ago)
Author:
kjm18
Message:

heaps of changes

Location:
trunk/gsdl/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgppbuilder.pm

    r1772 r1852  
    11###########################################################################
    22#
    3 # mgbuilder.pm -- MGBuilder object
     3# mgppbuilder.pm -- MGBuilder object
    44# A component of the Greenstone digital library software
    55# from the New Zealand Digital Library Project at the
     
    4848$maxdocsize = 12000;
    4949
    50 #update this !!!!!!!!!!!!!!!!
     50
    5151%wanted_index_files = ('td'=>1,
    5252               't'=>1,
     53               'tl'=>1,
     54               'ti'=>1,
    5355               'idb'=>1,
    5456               'ib1'=>1,
     
    5658               'ib3'=>1,
    5759               'i'=>1,
    58                'ip'=>1,
    59                'tiw'=>1,
     60               'il'=>1,
     61               'tw'=>1,
     62               'w'=>1,
    6063               'wa'=>1);
    6164
     65# change this so a user can add their own mappings via a file or cfg
     66%static_indexfield_map = ('Title'=>'TI',
     67              'TI'=>1,
     68              'Subject'=>'SU',
     69              'SU'=>1,
     70              'Creator'=>'CR',
     71              'CR'=>1,
     72              'Organization'=>'OR',
     73              'OR'=>1,
     74              'Source'=>'SO',
     75              'SO'=>1,
     76              'Howto'=>'HT',
     77              'HT'=>1,
     78              'ItemTitle'=>'IT',
     79              'IT'=>1,
     80              'ProgNumber'=>'PN',
     81              'PN'=>1,
     82              'People'=>'PE',
     83              'PE'=>1,
     84              'TextOnly'=>'TX',
     85              'TX'=>1);
    6286
    6387sub new {
     
    77101              'allclassifications'=>$allclassifications,
    78102              'outhandle'=>$outhandle,
    79               'notbuilt'=>[]    # indexes not built
    80               }, $class;
    81 
     103              'notbuilt'=>[],    # indexes not built
     104              'indexfieldmap'=>\%static_indexfield_map
     105          }, $class;
     106   
    82107
    83108    # read in the collection configuration file
     
    110135    }
    111136
     137    # get the levels (Section, Paragraph) for indexing and compression
     138    $self->{'levels'} = {};
     139    if (defined $self->{'collect_cfg'}->{'levels'}) {
     140        foreach $level ( @{$self->{'collect_cfg'}->{'levels'}} ){
     141            $self->{'levels'}->{$level} = 1;
     142        }
     143    } 
     144
    112145    # get the list of plugins for this collection
    113146    my $plugins = [];
     
    142175    # load up the document processor for building
    143176    # if a buildproc class has been created for this collection, use it
    144     # otherwise, use the mg buildproc
     177    # otherwise, use the mgpp buildproc
    145178    my ($buildprocdir, $buildproctype);
    146179    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
     
    175208}
    176209
    177 sub build_collection {
    178     my $self = shift (@_);
    179     my ($textindex, $indexname) = @_;
    180 
    181     my $outhandle = $self->{'outhandle'};
    182 
    183     print $outhandle "build_col, textindex=$textindex, indexname=$indexname\n";
     210sub set_strip_html {
     211    my $self = shift (@_);
     212    my ($strip) = @_;
     213   
     214    $self->{'strip_html'} = $strip;
     215    $self->{'buildproc'}->set_strip_html($strip);
     216}
     217
     218sub compress_text {
     219
     220    my $self = shift (@_);
     221    my ($textindex) = @_;
     222
    184223    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    185224    my $exe = &util::get_os_exe ();
    186 
    187225    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    188226    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    189     my $mg_perf_hash_build_exe =
    190     &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    191     my $mg_weights_build_exe =
    192     &util::filename_cat ($exedir, "mg_weights_build$exe");
    193     my $mg_invf_dict_exe =
    194     &util::filename_cat ($exedir, "mg_invf_dict$exe");
    195     my $mg_stem_idx_exe =
    196     &util::filename_cat ($exedir, "mg_stem_idx$exe");
     227    my $outhandle = $self->{'outhandle'};
    197228
    198229    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    199     my $basefilename = "$self->{'collection'}";
    200 #    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);
    201  #   my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
    202     #                      $self->{'collection'});
    203 
    204     my $fulltextprefix=$self->{'build_dir'}; # note if this works, change all to $directory, change in mg calls!!!!!!!!!!!!!!
    205     my $fullindexprefix=$self->{'build_dir'};
    206 
    207     my $directory = $self->{'build_dir'};
    208     my $osextra = "";
    209     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    210     $fulltextprefix =~ s/\//\\/g;
    211     #$directory = ~s/\//\\/g;
    212     } else {
    213     $osextra = " -d /";
    214     }
    215 
    216     #indexname got from command line arg. if not specified, its "", so use
    217     # ones stated in cfg file
    218     my $indexes = [];
    219     if (!(defined $indexname && $indexname =~ /\w/)) {
    220     $indexes = $self->{'collect_cfg'}->{'indexes'};
    221     $indexname="Title,Organization,Magazine,text";
    222     }
    223     else {
    224     push @$indexes, $indexname;
    225     }
    226     print $outhandle "indexes are: @$indexes\n";
    227    
    228 
    229     print $outhandle "\n*** mg_passes: first pass\n" if ($self->{'verbosity'} >= 1);
    230     print $outhandle "fulltextprefix=$fulltextprefix\n";
    231     # carry out the first pass of mg_passes
    232     # -b $maxdocsize sets the maximum document size to be 12 meg - not available any longer
    233     print $outhandle "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
     230
     231    my $builddir = $self->{'build_dir'};
     232    my $basefilename = "text/$self->{'collection'}";
     233
     234# mgpp can't work on Windows at the moment
     235#     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     236#    $basefilename =~ s/\//\\/g;
     237#    $builddir =~ s/\//\\/g;
     238#   
     239#    }
     240
     241
     242    # define the section names for mgpasses
     243    # the compressor doesn't need to know about paragraphs - never want to
     244    # retrieve them
     245    my $mg_passes_sections = "";
     246    if ($self->{'levels'}->{'Section'}) {
     247    $mg_passes_sections .= "-K Section ";
     248    }
     249   
     250    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
     251
     252    # collect the statistics for the text
     253    # -b $maxdocsize sets the maximum document size to be 12 meg
     254    print $outhandle "\n    collecting text statistics (mg_passes -T1)\n"  if ($self->{'verbosity'} >= 1);
    234255
    235256    my ($handle);
     
    238259    } else {
    239260    if (!-e "$mg_passes_exe" ||
    240         !open (PIPEOUT, "| $mg_passes_exe -K Section  -T1 -I1 -d $fulltextprefix -f $basefilename")) {
     261        !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -T1")) {
    241262        die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
    242263    }
     
    244265    }
    245266
    246    
    247     #Assume that only going to build one index for now. so index will be
    248     # anything specified in cfg file
    249267    $self->{'buildproc'}->set_output_handle ($handle);
    250268    $self->{'buildproc'}->set_mode ('text');
    251     $self->{'buildproc'}->set_index ($indexname);
    252     $self->{'buildproc'}->set_indexing_text (1); # not used at the moment I think
     269    $self->{'buildproc'}->set_index ($textindex);
     270    $self->{'buildproc'}->set_indexing_text (0);
     271    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     272    $self->{'buildproc'}->set_levels ($self->{'levels'});                     
    253273    $self->{'buildproc'}->reset();
    254274    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
     
    261281    close ($handle) unless $self->{'debug'};
    262282
    263 
    264     # create the compression dictionary
    265     # the compression dictionary is built by assuming the stats are from a seed
    266     # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    267     # and the resulting dictionary must be less than 5 meg with the most frequent
    268     # words being put into the dictionary first (-2 -k 5120)
    269     if (!$self->{'debug'}) {
    270     print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
    271     if (!-e "$mg_compression_dict_exe") {
    272         die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
    273     }
    274     system ("$mg_compression_dict_exe -d $fulltextprefix -f $basefilename");
    275 
    276     # create the perfect hash function
    277     if (!-e "$mg_perf_hash_build_exe") {
    278         die "mgppbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
    279     }
    280     system ("$mg_perf_hash_build_exe -d $fullindexprefix -f $basefilename");
    281 
    282     # compress the text
    283     # -b $maxdocsize sets the maximum document size to be 12 meg
    284     if (!$self->{'debug'}) {
    285     if (!-e "$mg_passes_exe" ||
    286         !open ($handle, "| $mg_passes_exe -K Section  -d $fulltextprefix -f $basefilename -T2 -I2")) {
    287         die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
    288     }
    289     }
    290     }
    291    
    292     $self->{'buildproc'}->reset();
    293    
    294     print $outhandle "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
    295     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    296            "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    297     close ($handle) unless $self->{'debug'};
    298    
    299    
    300    
    301     # create the weights file
    302     print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
    303     if (!-e "$mg_weights_build_exe") {
    304     die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
    305     }
    306     system ("$mg_weights_build_exe -d $fullindexprefix -f $basefilename ");
    307    
    308     # create 'on-disk' stemmed dictionary
    309     print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
    310     if (!-e "$mg_invf_dict_exe") {
    311     die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
    312     }
    313     system ("$mg_invf_dict_exe -d $fullindexprefix -f $basefilename");
    314    
    315    
    316     # creates stem index files for the various stemming methods
    317     print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
    318     if (!-e "$mg_stem_idx_exe") {
    319     die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
    320     }
    321     system ("$mg_stem_idx_exe -b 4096 -s1 -d $fullindexprefix -f $basefilename");
    322     system ("$mg_stem_idx_exe -b 4096 -s2 -d $fullindexprefix -f $basefilename");
    323     system ("$mg_stem_idx_exe -b 4096 -s3 -d $fullindexprefix -f $basefilename");
    324 }
    325 
    326 
    327 #for mgpp with more than one index
    328 sub compress_text {
    329 
    330     my $self = shift (@_);
    331     my ($textindex) = @_;
    332 
    333 #    $textindex = "Title,Organization,Subject,Magazine,text";
    334     my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    335     my $exe = &util::get_os_exe ();
    336     my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    337     my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    338     my $outhandle = $self->{'outhandle'};
    339 
    340     &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    341 
    342     my $builddir = $self->{'build_dir'};
    343     my $basefilename = "text/$self->{'collection'}";
    344    
    345      if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    346      $basefilename =~ s/\//\\/g;
    347      $builddir =~ s/\//\\/g;
    348    
    349     }
    350 
    351     print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    352 
    353     # collect the statistics for the text
    354     # -b $maxdocsize sets the maximum document size to be 12 meg
    355     print $outhandle "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
    356 
    357     my ($handle);
    358     if ($self->{'debug'}) {
    359     $handle = STDOUT;
    360     } else {
    361     if (!-e "$mg_passes_exe" ||
    362         !open (PIPEOUT, "| $mg_passes_exe -K Section  -d $builddir -f $basefilename -T1")) {
    363         die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
    364     }
    365     $handle = mgppbuilder::PIPEOUT;
    366     }
    367 
    368     $self->{'buildproc'}->set_output_handle ($handle);
    369     $self->{'buildproc'}->set_mode ('text');
    370     $self->{'buildproc'}->set_index ($textindex);
    371     $self->{'buildproc'}->set_indexing_text (0);
    372     $self->{'buildproc'}->reset();
    373     &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
    374            $self->{'buildproc'}, $self->{'maxdocs'});
    375     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    376            "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    377     &plugin::end($self->{'pluginfo'});
    378     close (PIPEOUT);
    379 
    380     close ($handle) unless $self->{'debug'};
    381 
    382283    # create the compression dictionary
    383284    # the compression dictionary is built by assuming the stats are from a seed
     
    385286    # and the resulting dictionary must be less than 5 meg with the most
    386287    # frequent words being put into the dictionary first (-2 -k 5120)
    387     # note: this options are left over from mg version
     288    # note: these options are left over from mg version
    388289    if (!$self->{'debug'}) {
    389290    print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
     
    396297    if (!$self->{'debug'}) {
    397298        if (!-e "$mg_passes_exe" ||
    398         !open ($handle, "| $mg_passes_exe -K Section -f $basefilename -d $builddir -T2")) {
     299        !open ($handle, "| $mg_passes_exe $mg_passes_compress_sections -f $basefilename -d $builddir -T2")) {
    399300        die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
    400301        }
     
    404305    $self->{'buildproc'}->reset();
    405306    # compress the text
    406     print $outhandle "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
     307    print $outhandle "\n    compressing the text (mg_passes -T2)\n"  if ($self->{'verbosity'} >= 1);
    407308    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    408309           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
     
    439340    $indexes = $self->{'collect_cfg'}->{'indexes'};
    440341    }
    441 
    442 #    push @$indexes, "text,Title,Organization,Magazine,Subject";
    443 #    push @$indexes, "Title,Organization,Magazine,Subject";
    444342
    445343    # create the mapping between the index descriptions
     
    593491    my $exe = &util::get_os_exe ();
    594492    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
     493
     494    # define the section names for mgpasses
     495    my $mg_passes_sections = "";
     496    foreach $level (keys (%{$self->{'levels'}})) {
     497    if ($level eq "Section" || $level eq "Paragraph") {
     498        $mg_passes_sections .= "-K $level ";
     499    }
     500    }
     501
    595502    my $mg_perf_hash_build_exe =
    596503    &util::filename_cat($exedir, "mg_perf_hash_build$exe");
     
    602509    &util::filename_cat ($exedir, "mg_stem_idx$exe");
    603510
    604     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    605     $builddir=~ s/\//\\/g;
    606     $basefilename =~ s/\//\\/g;
    607     }
     511#    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     512#   $builddir=~ s/\//\\/g;
     513#   $basefilename =~ s/\//\\/g;
     514#    }
    608515
    609516    # get the index expression if this index belongs
     
    631538
    632539    # Build index dictionary. Uses verbatim stem method
    633     print $outhandle "\n    creating index dictionary\n"  if ($self->{'verbosity'} >= 1);
     540    print $outhandle "\n    creating index dictionary (mg_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    634541    my ($handle);
    635542    if ($self->{'debug'}) {
     
    637544    } else {
    638545    if (!-e "$mg_passes_exe" ||
    639         !open (PIPEOUT, "| $mg_passes_exe -K Section  -d $builddir -f $basefilename -I1")) {
     546        !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -I1")) {
    640547        die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
    641548    }
     
    648555    $self->{'buildproc'}->set_index ($index, $indexexparr);
    649556    $self->{'buildproc'}->set_indexing_text (1);
    650 
     557    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     558    $self->{'buildproc'}->set_levels ($self->{'levels'});                       
    651559    $self->{'buildproc'}->reset();
    652560    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
     
    664572
    665573    if (!-e "$mg_passes_exe" ||
    666         !open ($handle, "| $mg_passes_exe -K Section  -d $builddir -f $basefilename -I2")) {
     574        !open ($handle, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -I2")) {
    667575        die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
    668576    }
     
    670578   
    671579    # invert the text
    672     print $outhandle "\n    inverting the text\n"  if ($self->{'verbosity'} >= 1);
     580    print $outhandle "\n    inverting the text (mg_passes -I2)\n"  if ($self->{'verbosity'} >= 1);
    673581
    674582    $self->{'buildproc'}->reset();
     
    708616   
    709617    # remove unwanted files
    710 #   my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    711 #   opendir (DIR, $tmpdir) || die
    712 #       "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
    713 #   foreach $file (readdir(DIR)) {
    714 #       next if $file =~ /^\./;
    715 #       my ($suffix) = $file =~ /\.([^\.]+)$/;
    716 #       if (defined $suffix && !defined $wanted_index_files{$suffix}) {
     618    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
     619    opendir (DIR, $tmpdir) || die
     620        "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
     621    foreach $file (readdir(DIR)) {
     622        next if $file =~ /^\./;
     623        my ($suffix) = $file =~ /\.([^\.]+)$/;
     624        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
    717625        # delete it!
    718 #       print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
    719 #       &util::rm (&util::filename_cat ($tmpdir, $file));
    720 #       }
    721 #   }
    722 #   closedir (DIR);
     626        print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
     627        &util::rm (&util::filename_cat ($tmpdir, $file));
     628        }
     629    }
     630    closedir (DIR);
    723631  }
    724632}   
     
    767675    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    768676    $self->{'buildproc'}->set_indexing_text (0);
     677    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     678
    769679    $self->{'buildproc'}->reset();
    770680
     
    791701        }
    792702    }
     703    #print out the indexfield mapping
     704    foreach $field (keys(%{$self->{'indexfieldmap'}})) {
     705        $shortname = $self->{'indexfieldmap'}->{$field};
     706        print $handle "<$shortname>$field\n";
     707    }
    793708    print $handle "\n" . ('-' x 70) . "\n";
    794709
     
    851766    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};
    852767
     768    # store the indexfieldmap information
     769    my @indexfieldmap = ();
     770    #add all fields bit
     771    foreach $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
     772    push (@indexfieldmap, "$field\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$field}");
     773    }
     774
     775    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;
    853776
    854777    #store the indexed field information
     
    860783    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
    861784                 '^(builddate|buildtype|numdocs|numbytes)$',
    862                              '^(indexmap|subcollectionmap|languagemap|notbuilt|indexfields)$');
     785                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');
    863786
    864787}
  • trunk/gsdl/perllib/mgppbuildproc.pm

    r1772 r1852  
    11###########################################################################
    22#
    3 # mgbuildproc.pm --
     3# mgppbuildproc.pm --
    44# A component of the Greenstone digital library software
    55# from the New Zealand Digital Library Project at the
     
    2525
    2626# This document processor outputs a document
    27 # for mg to process
     27# for mgpp to process
    2828
    2929
     
    6767    $self->{'num_processed_bytes'} = 0;
    6868    $self->{'outhandle'} = $outhandle;
     69    $self->{'dontindex'} = {};
     70    $self->{'indexfieldmap'} = {};
    6971
    7072    $self->{'indexing_text'} = 0;
    7173    $self->{'indexfields'} = {};
     74    $self->{'strip_html'}=1;
     75
    7276
    7377    return bless $self, $class;
     
    171175
    172176    return $self->{'indexing_text'};
     177}
     178
     179sub set_indexfieldmap {
     180    my $self = shift (@_);
     181    my ($indexmap) = @_;
     182
     183    $self->{'indexfieldmap'} = $indexmap;
     184}
     185
     186sub get_indexfieldmap {
     187    my $self = shift (@_);
     188
     189    return $self->{'indexfieldmap'};
     190}
     191
     192sub set_levels {
     193    my $self = shift (@_);
     194    my ($levels) = @_;
     195
     196    $self->{'levels'} = $levels;
     197}
     198
     199sub set_strip_html {
     200    my $self = shift (@_);
     201    my ($strip) = @_;
     202    $self->{'strip_html'}=$strip;
    173203}
    174204
     
    238268    my ($doc_obj, $filename) = @_;
    239269    my $handle = $self->{'output_handle'};
    240 #    $handle = "main::STDOUT";
    241270
    242271    my $doctype = $doc_obj->get_doc_type();
     
    244273    # only output this document if it is one to be indexed
    245274    return if ($doctype ne "indexed_doc");
     275
     276    #if a Section level index is not built, the gdbm file should be at doc
     277    #level not Section
     278    my $docs_only = 1;
     279    if ($self->{'levels'}->{'Section'}) {
     280    $docs_only = 0;
     281    }
    246282
    247283    my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
     
    287323
    288324    # output all the section metadata
    289     #my $found_doctype = 0;
    290325    my $metadata = $doc_obj->get_all_metadata ($section);
    291326    foreach $pair (@$metadata) {
    292327        my ($field, $value) = (@$pair);
    293328
    294         #$found_doctype = 1 if $field eq "doctype";
    295329        if ($field ne "Identifier" && $field !~ /^gsdl/ &&
    296330        defined $value && $value ne "") {
     
    315349    }
    316350
    317     # output the fact that this document is a document
    318     # (unless doctype was already output as part of
    319     # metadata)
    320     #if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {
    321     #    print $handle "<doctype>doc\n";
    322     #}
    323    
    324    
    325 
    326351    # output archivedir if at top level
    327352    if ($section eq $doc_obj->get_top_section()) {
     
    334359    }
    335360
    336     # output a list of children
    337     my $children = $doc_obj->get_children ($section);
    338     if (scalar(@$children) > 0) {
    339         print $handle "<childtype>$childtype\n";
    340         print $handle "<contains>";
    341         my $firstchild = 1;
    342         foreach $child (@$children) {
    343         print $handle ";" unless $firstchild;
    344         $firstchild = 0;
    345         if ($child =~ /^.*?\.(\d+)$/) {
    346             print $handle "\".$1";
    347         } else {
    348             print $handle "\".$child";
    349         }
     361    if (!$docs_only) {
     362        # output a list of children
     363        my $children = $doc_obj->get_children ($section);
     364        if (scalar(@$children) > 0) {
     365        print $handle "<childtype>$childtype\n";
     366        print $handle "<contains>";
     367        my $firstchild = 1;
     368        foreach $child (@$children) {
     369            print $handle ";" unless $firstchild;
     370            $firstchild = 0;
     371            if ($child =~ /^.*?\.(\d+)$/) {
     372            print $handle "\".$1";
     373            } else {
     374            print $handle "\".$child";
     375            }
    350376#       if ($child eq "") { print $handle "$doc_OID"; }
    351377#       elsif ($section eq "") { print $handle "$doc_OID.$child"; }
    352378#       else { print $handle "$doc_OID.$section.$child"; }
    353         }
    354         print $handle "\n";
    355     }
    356 
    357     # output the matching document number
    358     print $handle "<docnum>$self->{'num_sections'}\n";
    359 
     379        }
     380        print $handle "\n";
     381        }
     382        #output the matching doc number
     383        print $handle "<docnum>$self->{'num_sections'}\n";
     384       
     385    } # if (!$docs_only)
     386    else { #docs only, doc num is num_docs not num_sections
     387        # output the matching document number
     388        print $handle "<docnum>$self->{'num_docs'}\n";
     389    }
     390   
    360391    print $handle '-' x 70, "\n";
    361392
    362393   
    363394    # output a database entry for the document number
    364     print $handle "[$self->{'num_sections'}]\n";
    365     if ($section eq "") { print $handle "<section>$doc_OID\n"; }
    366     else { print $handle "<section>$doc_OID.$section\n"; }
     395    if ($docs_only) {
     396        print $handle "[$self->{'num_docs'}]\n";
     397        print $handle "<section>$doc_OID\n";
     398    }
     399    else {
     400        print $handle "[$self->{'num_sections'}]\n";
     401        if ($section eq "") { print $handle "<section>$doc_OID\n"; }
     402        else { print $handle "<section>$doc_OID.$section\n"; }
     403    }
    367404    print $handle '-' x 70, "\n";
    368405   
     
    374411    $first = 0;
    375412    $section = $doc_obj->get_next_section($section);
     413    last if ($docs_only); # if no sections wanted, only gdbm the docs
    376414    }
    377415
     
    384422    $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
    385423}
     424
     425# This function strips the HTML tags from the doc if ($strip_html) and,
     426# if ($para), inserts <Paragraph> tags at the <p> tags.
     427# If both are false, the original text is returned unchanged.
     428# Assumes that <pre> and </pre> contain no spaces, and removes all < and >
     429# characters between these tags.
     430sub preprocess_text {
     431    my $self = shift (@_);
     432    my ($text, $strip_html, $para) = @_;
     433
     434    my ($outtext) = "";
     435    if ($strip_html) {
     436    while ($text =~ /<([^>]*)>/ && $text ne "") {
     437       
     438        $tag = $1;
     439        $outtext .= $`." "; #add everything before the matched tag
     440        $text = $'; #everything after the matched tag
     441        if ($para && $tag =~ /^\s*p\s/) {
     442        $outtext .= "<Paragraph> ";
     443        }
     444        elsif ($tag =~ /^pre$/) { # a pre tag
     445        $text =~ /<\/pre>/; # find the closing pre tag
     446        my $tmp_text = $`; #everything before the closing pre tag
     447        $text = $'; #everything after the </pre>
     448        $tmp_text =~ s/[<>]//g; # remove all < and >
     449        $outtext.= $tmp_text . " ";
     450        }
     451    }
     452   
     453    $outtext .= $text; # add any remaining text
     454    return $outtext;
     455    } #if strip_html
     456
     457    if ($para) {
     458    $text =~ s/(<p\b)/<Paragraph>$1/gi;
     459    return $text;
     460    }
     461    return $text;
     462}
     463   
     464   
    386465
    387466sub filter_text {
     
    436515    # get the parameters for the output
    437516    my ($fields) = $self->{'index'};
    438     #print STDERR "fields are $fields\n";
    439     $fields =~ s/\ball\b/Title,Creator,text/; # add in others here
    440 
     517
     518    my ($sectiontag) = "";
     519    if ($self->{'levels'}->{'Section'}) {
     520    $sectiontag = "\n<Section>\n";
     521    }
     522    my ($paratag) = "";
     523    if ($self->{'levels'}->{'Paragraph'}) {
     524    $paratag = "<Paragraph>";
     525    }
    441526    my $doc_section = 0; # just for this document
    442527    my $text = "";
     
    455540    $doc_section++;
    456541    $self->{'num_sections'} += 1;
    457     $text .= "<Section>\n";
     542    $text .= $sectiontag;
     543
    458544    if ($indexed_doc) {
    459545        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
     
    464550        if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
    465551            my $new_text = "";
     552            my $tmp_text = "";
    466553            if ($real_field eq "text") {
    467             #print STDERR "in text bit";
    468             #$new_text = "<Paragraph>";
    469             $new_text .= $doc_obj->get_text ($section);
    470             #$self->find_paragraphs($new_text);
     554            if ($self->{'indexing_text'}) { # tag the text with <TX>...</TX>, add the <Paragraph> tags and strip out html if needed
     555                $new_text .= "<TX>\n";
     556                $tmp_text .= $doc_obj->get_text ($section);
     557                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, $self->{'levels'}->{'Paragraph'});
     558
     559                $new_text .= "$tmp_text</TX>\n";
     560                if (!defined $self->{'indexfields'}->{'TextOnly'}) {
     561                $self->{'indexfields'}->{'TextOnly'} = 1;   
     562                }
     563            }
     564            else { # leave html stuff in, and don't add Paragraph tags - never retrieve paras at the moment
     565                $new_text .= $doc_obj->get_text ($section);
     566                            #if ($self->{'levels'}->{'Paragraph'}) {
     567                #$self->find_paragraphs($new_text);
     568                #}               
     569            }
    471570            } else { # metadata field
    472571            if ($real_field eq "metadata") { # insert all metadata
    473                                              #except gsdl stuff
    474                 #print STDERR "in metadata bit\n";
     572                #except gsdl stuff
     573                my $shortname = "";
    475574                my $metadata = $doc_obj->get_all_metadata ($section);
    476575                foreach $pair (@$metadata) {
    477576                my ($mfield, $mvalue) = (@$pair);
    478                 #print STDERR "$mfield, $mvalue\n";
    479                 # check fields here, maybe others dont want
     577                # check fields here, maybe others dont want - change to use dontindex!!
    480578                if ($mfield ne "Identifier" && $mfield ne "classifytype" &&
    481579                    $mfield !~ /^gsdl/ && defined $mvalue && $mvalue ne "") {
    482                    
    483                     $new_text .= "<$mfield>$mvalue</$mfield>\n";
    484                     #print STDERR "metadata=$mfield:$mvalue";
    485                     if (!defined $self->{'indexfields'}->{$mfield}) {
    486                         $self->{'indexfields'}->{$mfield} = 1;
    487                     }                   
     580                   
     581                    if (defined $self->{'indexfieldmap'}->{$mfield}) {
     582                    $shortname = $self->{'indexfieldmap'}->{$mfield};
     583                    }
     584                    else {
     585                    $shortname = $self->create_shortname($mfield);
     586                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
     587                    $self->{'indexfieldmap'}->{$shortname} = 1;
     588                    }     
     589                    $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
     590                    if (!defined $self->{'indexfields'}->{$mfield}) {
     591                    $self->{'indexfields'}->{$mfield} = 1;
     592                    }                   
    488593                }
    489594                }
    490 
     595               
    491596            }
    492597            else { #individual metadata specified
     598                my $shortname="";
    493599                if (!defined $self->{'indexfields'}->{$real_field}) {
    494600                $self->{'indexfields'}->{$real_field} = 1;
    495                 }               
     601                }
     602                if (defined $self->{'indexfieldmap'}->{$real_field}) {
     603                $shortname = $self->{'indexfieldmap'}->{$real_field};
     604                }
     605                else {
     606                $shortname = $self->create_shortname($real_field);
     607                $self->{'indexfieldmap'}->{$real_field} = $shortname;
     608                $self->{'indexfieldmap'}->{$shortname} = 1;
     609                }
    496610                foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
    497                 $new_text .= "<$real_field>$item</$real_field>\n";
     611                $new_text .= "$paratag<$shortname>$item</$shortname>\n";
    498612                }
    499613            }
     
    508622            $new_text =~ /[\(\)\{\}]/) {
    509623            }
    510 
     624            $self->{'num_processed_bytes'} += length ($new_text);
    511625            $text .= "$new_text";
    512626        }
     
    519633}
    520634
     635sub create_shortname {
     636    $self = shift(@_);
     637   
     638    my ($realname) = @_;
     639    #take the first two chars
     640    my ($shortname) = $realname =~ /^(\w\w)/;
     641    $shortname =~ tr/a-z/A-Z/;
     642
     643    #if already used, take the first and third letters and so on
     644    $count = 1;
     645    while (defined $self->{'indexfieldmap'}->{$shortname}) {
     646    if ($realname =~ /^(\w).{$count}(\w)/) {
     647        $shortname = "$1$2";
     648    $count++;
     649    $shortname =~ tr/a-z/A-Z/;
     650   
     651    }
     652    else {
     653        $realname =~ s/^.//;
     654        $count = 0;
     655    }
     656    }
     657
     658    return $shortname;
     659}
     660
    5216611;
    522662
Note: See TracChangeset for help on using the changeset viewer.