Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1852

Timestamp:

2001-01-22T15:30:56+13:00 (23 years ago)

Author:

kjm18

Message:

heaps of changes

Location:

trunk/gsdl/perllib

Files:

: 2 edited

mgppbuilder.pm (modified) (26 diffs)
mgppbuildproc.pm (modified) (16 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/mgppbuilder.pm

-              r1772
+              r1852
 ###########################################################################
+#
 # mgbuilder.pm -- MGBuilder object
+# mgppbuilder.pm -- MGBuilder object
 # A component of the Greenstone digital library software
 # from the New Zealand Digital Library Project at the
 …
 $maxdocsize = 12000;
+#update this !!!!!!!!!!!!!!!!
 %wanted_index_files = ('td'=>1,
                't'=>1,
+               'tl'=>1,
+               'ti'=>1,
                'idb'=>1,
                'ib1'=>1,
 …
                'ib3'=>1,
                'i'=>1,
+               'ip'=>1,
+               'tiw'=>1,
+               'il'=>1,
+               'tw'=>1,
+               'w'=>1,
                'wa'=>1);
+# change this so a user can add their own ones in via a file or cfg
+%static_indexfield_map = ('Title'=>'TI',
+              'TI'=>1,
+              'Subject'=>'SU',
+              'SU'=>1,
+              'Creator'=>'CR',
+              'CR'=>1,
+              'Organization'=>'OR',
+              'OR'=>1,
+              'Source'=>'SO',
+              'SO'=>1,
+              'Howto'=>'HT',
+              'HT'=>1,
+              'ItemTitle'=>'IT',
+              'IT'=>1,
+              'ProgNumber'=>'PN',
+              'PN'=>1,
+              'People'=>'PE',
+              'PE'=>1,
+              'TextOnly'=>'TX',
+              'TX'=>1);
 sub new {
 …
               'allclassifications'=>$allclassifications,
               'outhandle'=>$outhandle,
+              'notbuilt'=>[]    # indexes not built
+              }, $class;
+              'notbuilt'=>[],    # indexes not built
+              'indexfieldmap'=>\%static_indexfield_map
+          }, $class;
     # read in the collection configuration file
 …
+    }
+    # get the levels (Section, Paragraph) for indexing and compression
+    $self->{'levels'} = {};
+    if (defined $self->{'collect_cfg'}->{'levels'}) {
+        foreach $level ( @{$self->{'collect_cfg'}->{'levels'}} ){
+            $self->{'levels'}->{$level} = 1;
+        }
+    }
     # get the list of plugins for this collection
     my $plugins = [];
 …
     # load up the document processor for building
     # if a buildproc class has been created for this collection, use it
     # otherwise, use the mg buildproc
+    # otherwise, use the mgpp buildproc
     my ($buildprocdir, $buildproctype);
     if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
 …
+}
+sub build_collection {
+    my $self = shift (@_);
+    my ($textindex, $indexname) = @_;
+    my $outhandle = $self->{'outhandle'};
+    print $outhandle "build_col, textindex=$textindex, indexname=$indexname\n";
+# Record the strip-html flag on this builder and hand the same value on to
+# the buildproc object held in $self->{'buildproc'}.
+sub set_strip_html {
+    my ($self, $strip) = @_;
+    my $buildproc = $self->{'buildproc'};
+    $self->{'strip_html'} = $strip;
+    $buildproc->set_strip_html ($strip);
+}
+sub compress_text {
+    my $self = shift (@_);
+    my ($textindex) = @_;
     my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
     my $exe = &util::get_os_exe ();
     my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
     my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
+    my $mg_perf_hash_build_exe =
+    &util::filename_cat($exedir, "mg_perf_hash_build$exe");
+    my $mg_weights_build_exe =
+    &util::filename_cat ($exedir, "mg_weights_build$exe");
+    my $mg_invf_dict_exe =
+    &util::filename_cat ($exedir, "mg_invf_dict$exe");
+    my $mg_stem_idx_exe =
+    &util::filename_cat ($exedir, "mg_stem_idx$exe");
+    my $outhandle = $self->{'outhandle'};
     &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
+    my $basefilename = "$self->{'collection'}";
+#    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);
+ #   my $fullindexprefix = &util::filename_cat ($self->{'build_dir'},
+    #                      $self->{'collection'});
+    my $fulltextprefix=$self->{'build_dir'}; # note if this works, change all to $directory, change in mg calls!!!!!!!!!!!!!!
+    my $fullindexprefix=$self->{'build_dir'};
+    my $directory = $self->{'build_dir'};
+    my $osextra = "";
+    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
+    $fulltextprefix =~ s/\//\\/g;
+    #$directory = ~s/\//\\/g;
+    } else {
+    $osextra = " -d /";
+    }
+    #indexname got from command line arg. if not specified, its "", so use
+    # ones stated in cfg file
+    my $indexes = [];
+    if (!(defined $indexname && $indexname =~ /\w/)) {
+    $indexes = $self->{'collect_cfg'}->{'indexes'};
+    $indexname="Title,Organization,Magazine,text";
+    }
+    else {
+    push @$indexes, $indexname;
+    }
+    print $outhandle "indexes are: @$indexes\n";
+    print $outhandle "\n*** mg_passes: first pass\n" if ($self->{'verbosity'} >= 1);
+    print $outhandle "fulltextprefix=$fulltextprefix\n";
+    # carry out the first pass of mg_passes
+    # -b $maxdocsize sets the maximum document size to be 12 meg - not available any longer
+    print $outhandle "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
+    my $builddir = $self->{'build_dir'};
+    my $basefilename = "text/$self->{'collection'}";
+# mgpp cant work on windows at the moment
+#     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
+#    $basefilename =~ s/\//\\/g;
+#    $builddir =~ s/\//\\/g;
+#
+#    }
+    # define the section names for mgpasses
+    # the compressor doesn't need to know about paragraphs - never want to
+    # retrieve them
+    my $mg_passes_sections = "";
+    if ($self->{'levels'}->{'Section'}) {
+    $mg_passes_sections .= "-K Section ";
+    }
+    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
+    # collect the statistics for the text
+    # -b $maxdocsize sets the maximum document size to be 12 meg
+    print $outhandle "\n    collecting text statistics (mg_passes -T1)\n"  if ($self->{'verbosity'} >= 1);
     my ($handle);
 …
     } else {
     if (!-e "$mg_passes_exe" ||
         !open (PIPEOUT, "| $mg_passes_exe -K Section  -T1 -I1 -d $fulltextprefix -f $basefilename")) {
+        !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -T1")) {
         die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
+    }
 …
+    }
-    #Assume that only going to build one index for now. so index will be
-    # anything specified in cfg file
     $self->{'buildproc'}->set_output_handle ($handle);
     $self->{'buildproc'}->set_mode ('text');
+    $self->{'buildproc'}->set_index ($indexname);
+    $self->{'buildproc'}->set_indexing_text (1); # not used at the moment I think
+    $self->{'buildproc'}->set_index ($textindex);
+    $self->{'buildproc'}->set_indexing_text (0);
+    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
+    $self->{'buildproc'}->set_levels ($self->{'levels'});
     $self->{'buildproc'}->reset();
     &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
 …
     close ($handle) unless $self->{'debug'};
-    # create the compression dictionary
-    # the compression dictionary is built by assuming the stats are from a seed
-    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
-    # and the resulting dictionary must be less than 5 meg with the most frequent
-    # words being put into the dictionary first (-2 -k 5120)
-    if (!$self->{'debug'}) {
-    print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
-    if (!-e "$mg_compression_dict_exe") {
-        die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
+    }
-    system ("$mg_compression_dict_exe -d $fulltextprefix -f $basefilename");
-    # create the perfect hash function
-    if (!-e "$mg_perf_hash_build_exe") {
-        die "mgppbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
+    }
-    system ("$mg_perf_hash_build_exe -d $fullindexprefix -f $basefilename");
-    # compress the text
-    # -b $maxdocsize sets the maximum document size to be 12 meg
-    if (!$self->{'debug'}) {
-    if (!-e "$mg_passes_exe" ||
-        !open ($handle, "| $mg_passes_exe -K Section  -d $fulltextprefix -f $basefilename -T2 -I2")) {
-        die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
+    }
+    }
+    }
-    $self->{'buildproc'}->reset();
-    print $outhandle "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
-    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
-           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
-    close ($handle) unless $self->{'debug'};
-    # create the weights file
-    print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
-    if (!-e "$mg_weights_build_exe") {
-    die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
+    }
-    system ("$mg_weights_build_exe -d $fullindexprefix -f $basefilename ");
-    # create 'on-disk' stemmed dictionary
-    print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
-    if (!-e "$mg_invf_dict_exe") {
-    die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
+    }
-    system ("$mg_invf_dict_exe -d $fullindexprefix -f $basefilename");
-    # creates stem index files for the various stemming methods
-    print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
-    if (!-e "$mg_stem_idx_exe") {
-    die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
+    }
-    system ("$mg_stem_idx_exe -b 4096 -s1 -d $fullindexprefix -f $basefilename");
-    system ("$mg_stem_idx_exe -b 4096 -s2 -d $fullindexprefix -f $basefilename");
-    system ("$mg_stem_idx_exe -b 4096 -s3 -d $fullindexprefix -f $basefilename");
+}
-#for mgpp with more than one index
-sub compress_text {
-    my $self = shift (@_);
-    my ($textindex) = @_;
-#    $textindex = "Title,Organization,Subject,Magazine,text";
-    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
-    my $exe = &util::get_os_exe ();
-    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
-    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
-    my $outhandle = $self->{'outhandle'};
-    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
-    my $builddir = $self->{'build_dir'};
-    my $basefilename = "text/$self->{'collection'}";
-     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
-     $basefilename =~ s/\//\\/g;
-     $builddir =~ s/\//\\/g;
+    }
-    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
-    # collect the statistics for the text
-    # -b $maxdocsize sets the maximum document size to be 12 meg
-    print $outhandle "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
-    my ($handle);
-    if ($self->{'debug'}) {
-    $handle = STDOUT;
-    } else {
-    if (!-e "$mg_passes_exe" ||
-        !open (PIPEOUT, "| $mg_passes_exe -K Section  -d $builddir -f $basefilename -T1")) {
-        die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
+    }
-    $handle = mgppbuilder::PIPEOUT;
+    }
-    $self->{'buildproc'}->set_output_handle ($handle);
-    $self->{'buildproc'}->set_mode ('text');
-    $self->{'buildproc'}->set_index ($textindex);
-    $self->{'buildproc'}->set_indexing_text (0);
-    $self->{'buildproc'}->reset();
-    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
-           $self->{'buildproc'}, $self->{'maxdocs'});
-    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
-           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
-    &plugin::end($self->{'pluginfo'});
-    close (PIPEOUT);
-    close ($handle) unless $self->{'debug'};
     # create the compression dictionary
     # the compression dictionary is built by assuming the stats are from a seed
 …
     # and the resulting dictionary must be less than 5 meg with the most
     # frequent words being put into the dictionary first (-2 -k 5120)
     # note: this options are left over from mg version
+    # note: these options are left over from mg version
     if (!$self->{'debug'}) {
     print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
 …
     if (!$self->{'debug'}) {
         if (!-e "$mg_passes_exe" ||
         !open ($handle, "| $mg_passes_exe -K Section  -f $basefilename -d $builddir -T2")) {
+        !open ($handle, "| $mg_passes_exe $mg_passes_compress_sections -f $basefilename -d $builddir -T2")) {
         die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
+        }
 …
     $self->{'buildproc'}->reset();
     # compress the text
     print $outhandle "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
+    print $outhandle "\n    compressing the text (mg_passes -T2)\n"  if ($self->{'verbosity'} >= 1);
     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
            "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
 …
     $indexes = $self->{'collect_cfg'}->{'indexes'};
+    }
-#    push @$indexes, "text,Title,Organization,Magazine,Subject";
-#    push @$indexes, "Title,Organization,Magazine,Subject";
     # create the mapping between the index descriptions
 …
     my $exe = &util::get_os_exe ();
     my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
+    # define the section names for mgpasses
+    my $mg_passes_sections = "";
+    foreach $level (keys (%{$self->{'levels'}})) {
+    if ($level eq "Section" || $level eq "Paragraph") {
+        $mg_passes_sections .= "-K $level ";
+    }
+    }
     my $mg_perf_hash_build_exe =
     &util::filename_cat($exedir, "mg_perf_hash_build$exe");
 …
     &util::filename_cat ($exedir, "mg_stem_idx$exe");
     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     $builddir=~ s/\//\\/g;
     $basefilename =~ s/\//\\/g;
+    }
+#    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
+#   $builddir=~ s/\//\\/g;
+#   $basefilename =~ s/\//\\/g;
+#    }
     # get the index expression if this index belongs
 …
     # Build index dictionary. Uses verbatim stem method
     print $outhandle "\n    creating index dictionary\n"  if ($self->{'verbosity'} >= 1);
+    print $outhandle "\n    creating index dictionary (mg_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
     my ($handle);
     if ($self->{'debug'}) {
 …
     } else {
     if (!-e "$mg_passes_exe" ||
         !open (PIPEOUT, "| $mg_passes_exe -K Section  -d $builddir -f $basefilename -I1")) {
+        !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -I1")) {
         die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
+    }
 …
     $self->{'buildproc'}->set_index ($index, $indexexparr);
     $self->{'buildproc'}->set_indexing_text (1);
+    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
+    $self->{'buildproc'}->set_levels ($self->{'levels'});
     $self->{'buildproc'}->reset();
     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
 …
     if (!-e "$mg_passes_exe" ||
         !open ($handle, "| $mg_passes_exe -K Section  -d $builddir -f $basefilename -I2")) {
+        !open ($handle, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -I2")) {
         die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
+    }
 …
     # invert the text
     print $outhandle "\n    inverting the text\n"  if ($self->{'verbosity'} >= 1);
+    print $outhandle "\n    inverting the text (mg_passes -I2)\n"  if ($self->{'verbosity'} >= 1);
     $self->{'buildproc'}->reset();
 …
     # remove unwanted files
 #   my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
 #   opendir (DIR, $tmpdir) || die
 #       "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
 #   foreach $file (readdir(DIR)) {
 #       next if $file =~ /^\./;
 #       my ($suffix) = $file =~ /\.([^\.]+)$/;
 #       if (defined $suffix && !defined $wanted_index_files{$suffix}) {
+    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
+    opendir (DIR, $tmpdir) || die
+        "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
+    foreach $file (readdir(DIR)) {
+        next if $file =~ /^\./;
+        my ($suffix) = $file =~ /\.([^\.]+)$/;
+        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
         # delete it!
 #       print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
 #       &util::rm (&util::filename_cat ($tmpdir, $file));
 #       }
 #   }
 #   closedir (DIR);
+        print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
+        &util::rm (&util::filename_cat ($tmpdir, $file));
+        }
+    }
+    closedir (DIR);
+  }
+}
 …
     $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
     $self->{'buildproc'}->set_indexing_text (0);
+    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     $self->{'buildproc'}->reset();
 …
+        }
+    }
+    #print out the indexfield mapping
+    foreach $field (keys(%{$self->{'indexfieldmap'}})) {
+        $shortname = $self->{'indexfieldmap'}->{$field};
+        print $handle "<$shortname>$field\n";
+    }
     print $handle "\n" . ('-' x 70) . "\n";
 …
     $build_cfg->{'notbuilt'} = $self->{'notbuilt'};
+    # store the indexfieldmap information
+    my @indexfieldmap = ();
+    #add all fields bit
+    foreach $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
+    push (@indexfieldmap, "$field\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$field}");
+    }
+    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;
     #store the indexed field information
 …
     &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                  '^(builddate|buildtype|numdocs|numbytes)$',
                              '^(indexmap|subcollectionmap|languagemap|notbuilt|indexfields)$');
+                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');
+}

trunk/gsdl/perllib/mgppbuildproc.pm

-              r1772
+              r1852
 ###########################################################################
+#
 # mgbuildproc.pm --
+# mgppbuildproc.pm --
 # A component of the Greenstone digital library software
 # from the New Zealand Digital Library Project at the
 …
 # This document processor outputs a document
 # for mg to process
+# for mgpp to process
 …
     $self->{'num_processed_bytes'} = 0;
     $self->{'outhandle'} = $outhandle;
+    $self->{'dontindex'} = {};
+    $self->{'indexfieldmap'} = {};
     $self->{'indexing_text'} = 0;
     $self->{'indexfields'} = {};
+    $self->{'strip_html'}=1;
     return bless $self, $class;
 …
     return $self->{'indexing_text'};
+}
+# Store the given index-field map (a hash reference) on this object.
+sub set_indexfieldmap {
+    my ($self, $indexmap) = @_;
+    $self->{'indexfieldmap'} = $indexmap;
+}
+# Return whatever is currently stored as this object's index-field map.
+sub get_indexfieldmap {
+    my ($self) = @_;
+    return $self->{'indexfieldmap'};
+}
+# Store the given levels value on this object under the 'levels' key.
+sub set_levels {
+    my ($self, $levels) = @_;
+    $self->{'levels'} = $levels;
+}
+# Store the strip-html flag on this object under the 'strip_html' key.
+sub set_strip_html {
+    my ($self, $strip) = @_;
+    $self->{'strip_html'} = $strip;
+}
 …
     my ($doc_obj, $filename) = @_;
     my $handle = $self->{'output_handle'};
-#    $handle = "main::STDOUT";
     my $doctype = $doc_obj->get_doc_type();
 …
     # only output this document if it is one to be indexed
     return if ($doctype ne "indexed_doc");
+    #if a Section level index is not built, the gdbm file should be at doc
+    #level not Section
+    my $docs_only = 1;
+    if ($self->{'levels'}->{'Section'}) {
+    $docs_only = 0;
+    }
     my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
 …
     # output all the section metadata
-    #my $found_doctype = 0;
     my $metadata = $doc_obj->get_all_metadata ($section);
     foreach $pair (@$metadata) {
         my ($field, $value) = (@$pair);
-        #$found_doctype = 1 if $field eq "doctype";
         if ($field ne "Identifier" && $field !~ /^gsdl/ &&
         defined $value && $value ne "") {
 …
+    }
-    # output the fact that this document is a document
-    # (unless doctype was already output as part of
-    # metadata)
-    #if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {
-    #    print $handle "<doctype>doc\n";
-    #}
     # output archivedir if at top level
     if ($section eq $doc_obj->get_top_section()) {
 …
+    }
+    # output a list of children
+    my $children = $doc_obj->get_children ($section);
+    if (scalar(@$children) > 0) {
+        print $handle "<childtype>$childtype\n";
+        print $handle "<contains>";
+        my $firstchild = 1;
+        foreach $child (@$children) {
+        print $handle ";" unless $firstchild;
+        $firstchild = 0;
+        if ($child =~ /^.*?\.(\d+)$/) {
+            print $handle "\".$1";
+        } else {
+            print $handle "\".$child";
+        }
+    if (!$docs_only) {
+        # output a list of children
+        my $children = $doc_obj->get_children ($section);
+        if (scalar(@$children) > 0) {
+        print $handle "<childtype>$childtype\n";
+        print $handle "<contains>";
+        my $firstchild = 1;
+        foreach $child (@$children) {
+            print $handle ";" unless $firstchild;
+            $firstchild = 0;
+            if ($child =~ /^.*?\.(\d+)$/) {
+            print $handle "\".$1";
+            } else {
+            print $handle "\".$child";
+            }
 #       if ($child eq "") { print $handle "$doc_OID"; }
 #       elsif ($section eq "") { print $handle "$doc_OID.$child"; }
 #       else { print $handle "$doc_OID.$section.$child"; }
+        }
+        print $handle "\n";
+    }
+    # output the matching document number
+    print $handle "<docnum>$self->{'num_sections'}\n";
+        }
+        print $handle "\n";
+        }
+        #output the matching doc number
+        print $handle "<docnum>$self->{'num_sections'}\n";
+    } # if (!$docs_only)
+    else { #docs only, doc num is num_docs not num_sections
+        # output the matching document number
+        print $handle "<docnum>$self->{'num_docs'}\n";
+    }
     print $handle '-' x 70, "\n";
     # output a database entry for the document number
+    print $handle "[$self->{'num_sections'}]\n";
+    if ($section eq "") { print $handle "<section>$doc_OID\n"; }
+    else { print $handle "<section>$doc_OID.$section\n"; }
+    if ($docs_only) {
+        print $handle "[$self->{'num_docs'}]\n";
+        print $handle "<section>$doc_OID\n";
+    }
+    else {
+        print $handle "[$self->{'num_sections'}]\n";
+        if ($section eq "") { print $handle "<section>$doc_OID\n"; }
+        else { print $handle "<section>$doc_OID.$section\n"; }
+    }
     print $handle '-' x 70, "\n";
 …
     $first = 0;
     $section = $doc_obj->get_next_section($section);
+    last if ($docs_only); # if no sections wanted, only gdbm the docs
+    }
 …
     $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
+}
+# Prepare a piece of text for indexing.
+#   $text       - the text to process
+#   $strip_html - if true, remove markup tags; each removed tag leaves a
+#                 single space behind
+#   $para       - if true, mark paragraph starts with <Paragraph>
+# With $strip_html set, the contents of <pre>...</pre> are kept with every
+# '<' and '>' deleted, and (when $para is also set) each <p> or <p ...> tag
+# becomes "<Paragraph> ".  With only $para set, "<Paragraph>" is inserted in
+# front of every <p tag and the markup is otherwise left alone.  If both
+# flags are false the original text is returned.
+# Tag names are matched case-insensitively.  Assumes <pre> and </pre> are
+# written without embedded spaces.  A <pre> with no closing tag is treated
+# like any other tag, so the text after it is still processed rather than
+# being dropped.
+sub preprocess_text {
+    my $self = shift (@_);
+    my ($text, $strip_html, $para) = @_;
+    if (!$strip_html) {
+        $text =~ s/(<p\b)/<Paragraph>$1/gi if ($para);
+        return $text;
+    }
+    my $outtext = "";
+    while ($text =~ /<([^>]*)>/) {
+        my $tag = $1; # lexical, so no package global gets clobbered
+        # keep everything before the matched tag; @- and @+ give the match
+        # offsets without the program-wide cost of $` and $'
+        $outtext .= substr ($text, 0, $-[0]) . " ";
+        $text = substr ($text, $+[0]); # everything after the matched tag
+        if ($para && $tag =~ /^\s*p\b/i) {
+            # \b rather than \s so that a bare <p> counts as well
+            $outtext .= "<Paragraph> ";
+        }
+        elsif ($tag =~ /^pre$/i) { # a pre tag
+            # only consume up to </pre> if there really is one; otherwise
+            # fall through and let the loop handle the remaining text
+            if ($text =~ /<\/pre>/i) {
+                my $pre_text = substr ($text, 0, $-[0]);
+                $text = substr ($text, $+[0]); # everything after the </pre>
+                $pre_text =~ s/[<>]//g; # remove all < and >
+                $outtext .= $pre_text . " ";
+            }
+        }
+    }
+    $outtext .= $text; # add any remaining text
+    return $outtext;
+}
 sub filter_text {
 …
     # get the parameters for the output
     my ($fields) = $self->{'index'};
+    #print STDERR "fields are $fields\n";
+    $fields =~ s/\ball\b/Title,Creator,text/; # add in others here
+    my ($sectiontag) = "";
+    if ($self->{'levels'}->{'Section'}) {
+    $sectiontag = "\n<Section>\n";
+    }
+    my ($paratag) = "";
+    if ($self->{'levels'}->{'Paragraph'}) {
+    $paratag = "<Paragraph>";
+    }
     my $doc_section = 0; # just for this document
     my $text = "";
 …
     $doc_section++;
     $self->{'num_sections'} += 1;
+    $text .= "<Section>\n";
+    $text .= $sectiontag;
     if ($indexed_doc) {
         $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
 …
         if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
             my $new_text = "";
+            my $tmp_text = "";
             if ($real_field eq "text") {
+            #print STDERR "in text bit";
+            #$new_text = "<Paragraph>";
+            $new_text .= $doc_obj->get_text ($section);
+            #$self->find_paragraphs($new_text);
+            if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
+                $new_text .= "<TX>\n";
+                $tmp_text .= $doc_obj->get_text ($section);
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, $self->{'levels'}->{'Paragraph'});
+                $new_text .= "$tmp_text</TX>\n";
+                if (!defined $self->{'indexfields'}->{'TextOnly'}) {
+                $self->{'indexfields'}->{'TextOnly'} = 1;
+                }
+            }
+            else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
+                $new_text .= $doc_obj->get_text ($section);
+                            #if ($self->{'levels'}->{'Paragraph'}) {
+                #$self->find_paragraphs($new_text);
+                #}
+            }
             } else { # metadata field
             if ($real_field eq "metadata") { # insert all metadata
                                              #except gsdl stuff
                 #print STDERR "in metadata bit\n";
+                #except gsdl stuff
+                my $shortname = "";
                 my $metadata = $doc_obj->get_all_metadata ($section);
                 foreach $pair (@$metadata) {
                 my ($mfield, $mvalue) = (@$pair);
+                #print STDERR "$mfield, $mvalue\n";
+                # check fields here, maybe others dont want
+                # check fields here, maybe others dont want - change to use dontindex!!
                 if ($mfield ne "Identifier" && $mfield ne "classifytype" &&
                     $mfield !~ /^gsdl/ && defined $mvalue && $mvalue ne "") {
+                    $new_text .= "<$mfield>$mvalue</$mfield>\n";
+                    #print STDERR "metadata=$mfield:$mvalue";
+                    if (!defined $self->{'indexfields'}->{$mfield}) {
+                        $self->{'indexfields'}->{$mfield} = 1;
+                    }
+                    if (defined $self->{'indexfieldmap'}->{$mfield}) {
+                    $shortname = $self->{'indexfieldmap'}->{$mfield};
+                    }
+                    else {
+                    $shortname = $self->create_shortname($mfield);
+                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
+                    $self->{'indexfieldmap'}->{$shortname} = 1;
+                    }
+                    $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
+                    if (!defined $self->{'indexfields'}->{$mfield}) {
+                    $self->{'indexfields'}->{$mfield} = 1;
+                    }
+                }
+                }
+            }
             else { #individual metadata specified
+                my $shortname="";
                 if (!defined $self->{'indexfields'}->{$real_field}) {
                 $self->{'indexfields'}->{$real_field} = 1;
+                }
+                }
+                if (defined $self->{'indexfieldmap'}->{$real_field}) {
+                $shortname = $self->{'indexfieldmap'}->{$real_field};
+                }
+                else {
+                $shortname = $self->create_shortname($real_field);
+                $self->{'indexfieldmap'}->{$real_field} = $shortname;
+                $self->{'indexfieldmap'}->{$shortname} = 1;
+                }
                 foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
                 $new_text .= "<$real_field>$item</$real_field>\n";
+                $new_text .= "$paratag<$shortname>$item</$shortname>\n";
+                }
+            }
 …
             $new_text =~ /[\(\)\{\}]/) {
+            }
+            $self->{'num_processed_bytes'} += length ($new_text);
             $text .= "$new_text";
+        }
 …
+}
+# Invent a two-character upper-case short name for the field $realname that
+# is not yet a key of $self->{'indexfieldmap'}, and return it.  The map
+# itself is not modified here.
+# Candidates are tried in this order: the first two characters, then the
+# first character paired with the third, fourth, ... character; when those
+# run out the leading character is dropped and the search starts again on
+# the shortened name.  If the whole name is used up without finding a free
+# candidate, the first unused code in 'AA' .. 'ZZ' is returned, and the sub
+# dies only if all of those are taken as well.
+sub create_shortname {
+    my $self = shift(@_);
+    my ($realname) = @_;
+    my $remaining = $realname; # shortened as candidates run out
+    #take the first two chars
+    my ($shortname) = $remaining =~ /^(\w\w)/;
+    $shortname =~ tr/a-z/A-Z/ if (defined $shortname);
+    #if already used, take the first and third letters and so on
+    my $count = 1;
+    while (!defined $shortname ||
+           defined $self->{'indexfieldmap'}->{$shortname}) {
+        if ($remaining =~ /^(\w).{$count}(\w)/) {
+            $shortname = "$1$2";
+            $shortname =~ tr/a-z/A-Z/;
+            $count++;
+        }
+        elsif ($remaining =~ s/^.//s) {
+            # nothing left to pair with this first character: drop it and
+            # start again with the first two characters of what remains
+            $count = 0;
+        }
+        else {
+            # the name is used up; without this branch the loop would
+            # never end, so fall back to any free two-letter code
+            foreach my $code ('AA' .. 'ZZ') {
+                return $code if (!defined $self->{'indexfieldmap'}->{$code});
+            }
+            die "mgppbuildproc::create_shortname - no unused short name left for $realname\n";
+        }
+    }
+    return $shortname;
+}
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1852

Legend:

trunk/gsdl/perllib/mgppbuilder.pm

trunk/gsdl/perllib/mgppbuildproc.pm

Download in other formats: