Context Navigation

← Previous Change
Next Change →

mgppbuildproc.pm

Timestamp:

2001-01-22T15:30:56+13:00 (23 years ago)

Author:

kjm18

Message:

heaps of changes

File:

: 1 edited

trunk/gsdl/perllib/mgppbuildproc.pm (modified) (16 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/mgppbuildproc.pm

-              r1772
+              r1852
 ###########################################################################
+#
 # mgbuildproc.pm --
+# mgppbuildproc.pm --
 # A component of the Greenstone digital library software
 # from the New Zealand Digital Library Project at the
 …
 # This document processor outputs a document
 # for mg to process
+# for mgpp to process
 …
     $self->{'num_processed_bytes'} = 0;
     $self->{'outhandle'} = $outhandle;
+    $self->{'dontindex'} = {};
+    $self->{'indexfieldmap'} = {};
     $self->{'indexing_text'} = 0;
     $self->{'indexfields'} = {};
+    $self->{'strip_html'}=1;
     return bless $self, $class;
 …
     return $self->{'indexing_text'};
+}
+# Setter: remember the supplied value under this object's
+# 'indexfieldmap' key.
+#   $indexmap - the value to store (returned as the result of the assignment)
+sub set_indexfieldmap {
+    my ($self, $indexmap) = @_;
+    $self->{'indexfieldmap'} = $indexmap;
+}
+# Getter: hand back whatever is currently stored under this object's
+# 'indexfieldmap' key.
+sub get_indexfieldmap {
+    my ($self) = @_;
+    return $self->{'indexfieldmap'};
+}
+# Setter: remember the supplied value under this object's 'levels' key.
+#   $levels - the value to store (returned as the result of the assignment)
+sub set_levels {
+    my ($self, $levels) = @_;
+    $self->{'levels'} = $levels;
+}
+# Setter: remember the supplied value under this object's
+# 'strip_html' key.
+#   $strip - the value to store (returned as the result of the assignment)
+sub set_strip_html {
+    my ($self, $strip) = @_;
+    $self->{'strip_html'} = $strip;
+}
 …
     my ($doc_obj, $filename) = @_;
     my $handle = $self->{'output_handle'};
-#    $handle = "main::STDOUT";
     my $doctype = $doc_obj->get_doc_type();
 …
     # only output this document if it is one to be indexed
     return if ($doctype ne "indexed_doc");
+    #if a Section level index is not built, the gdbm file should be at doc
+    #level not Section
+    my $docs_only = 1;
+    if ($self->{'levels'}->{'Section'}) {
+    $docs_only = 0;
+    }
     my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
 …
     # output all the section metadata
-    #my $found_doctype = 0;
     my $metadata = $doc_obj->get_all_metadata ($section);
     foreach $pair (@$metadata) {
         my ($field, $value) = (@$pair);
-        #$found_doctype = 1 if $field eq "doctype";
         if ($field ne "Identifier" && $field !~ /^gsdl/ &&
         defined $value && $value ne "") {
 …
+    }
-    # output the fact that this document is a document
-    # (unless doctype was already output as part of
-    # metadata)
-    #if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {
-    #    print $handle "<doctype>doc\n";
-    #}
     # output archivedir if at top level
     if ($section eq $doc_obj->get_top_section()) {
 …
+    }
+    # output a list of children
+    my $children = $doc_obj->get_children ($section);
+    if (scalar(@$children) > 0) {
+        print $handle "<childtype>$childtype\n";
+        print $handle "<contains>";
+        my $firstchild = 1;
+        foreach $child (@$children) {
+        print $handle ";" unless $firstchild;
+        $firstchild = 0;
+        if ($child =~ /^.*?\.(\d+)$/) {
+            print $handle "\".$1";
+        } else {
+            print $handle "\".$child";
+        }
+    if (!$docs_only) {
+        # output a list of children
+        my $children = $doc_obj->get_children ($section);
+        if (scalar(@$children) > 0) {
+        print $handle "<childtype>$childtype\n";
+        print $handle "<contains>";
+        my $firstchild = 1;
+        foreach $child (@$children) {
+            print $handle ";" unless $firstchild;
+            $firstchild = 0;
+            if ($child =~ /^.*?\.(\d+)$/) {
+            print $handle "\".$1";
+            } else {
+            print $handle "\".$child";
+            }
 #       if ($child eq "") { print $handle "$doc_OID"; }
 #       elsif ($section eq "") { print $handle "$doc_OID.$child"; }
 #       else { print $handle "$doc_OID.$section.$child"; }
+        }
+        print $handle "\n";
+    }
+    # output the matching document number
+    print $handle "<docnum>$self->{'num_sections'}\n";
+        }
+        print $handle "\n";
+        }
+        #output the matching doc number
+        print $handle "<docnum>$self->{'num_sections'}\n";
+    } # if (!$docs_only)
+    else { #docs only, doc num is num_docs not num_sections
+        # output the matching document number
+        print $handle "<docnum>$self->{'num_docs'}\n";
+    }
     print $handle '-' x 70, "\n";
     # output a database entry for the document number
+    print $handle "[$self->{'num_sections'}]\n";
+    if ($section eq "") { print $handle "<section>$doc_OID\n"; }
+    else { print $handle "<section>$doc_OID.$section\n"; }
+    if ($docs_only) {
+        print $handle "[$self->{'num_docs'}]\n";
+        print $handle "<section>$doc_OID\n";
+    }
+    else {
+        print $handle "[$self->{'num_sections'}]\n";
+        if ($section eq "") { print $handle "<section>$doc_OID\n"; }
+        else { print $handle "<section>$doc_OID.$section\n"; }
+    }
     print $handle '-' x 70, "\n";
 …
     $first = 0;
     $section = $doc_obj->get_next_section($section);
+    last if ($docs_only); # if no sections wanted, only gdbm the docs
+    }
 …
     $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
+}
+# preprocess_text ($text, $strip_html, $para)
+#
+# Prepares a chunk of document text for indexing.
+#   $text       - the text to process (undef is treated as "")
+#   $strip_html - if true, every <...> tag is removed and replaced by a
+#                 single space
+#   $para       - if true, paragraph tags are marked with <Paragraph>
+#
+# When $strip_html is true:
+#   * each tag is replaced by a space; if $para is also true, <p ...> tags
+#     (any case, with or without attributes) become "<Paragraph> "
+#   * the content of <pre>...</pre> is kept, but every '<' and '>' inside
+#     it is removed.  <pre> and </pre> must contain no spaces or
+#     attributes.  An unterminated <pre> swallows the rest of the text.
+# When only $para is true, "<Paragraph>" is inserted in front of every
+# <p tag and the html is otherwise left alone.
+# When both are false the text is returned unchanged.
+sub preprocess_text {
+    my $self = shift (@_);
+    my ($text, $strip_html, $para) = @_;
+    $text = "" unless defined $text;
+
+    if ($strip_html) {
+        my $outtext = "";
+        # Peel off one tag per iteration using captures instead of $` / $',
+        # so that a later failed match cannot leak stale values.
+        while ($text =~ /\A(.*?)<([^>]*)>(.*)\z/s) {
+            my ($before, $tag, $after) = ($1, $2, $3);
+            $outtext .= $before . " ";   # text before the tag, tag -> space
+            $text = $after;              # keep scanning after the tag
+
+            if ($para && $tag =~ /^\s*p\b/i) {
+                $outtext .= "<Paragraph> ";
+            }
+            elsif ($tag =~ /^pre$/i) {   # a pre tag
+                my $pre_text;
+                if ($text =~ /\A(.*?)<\/pre>(.*)\z/si) {
+                    ($pre_text, $text) = ($1, $2);
+                }
+                else {
+                    # no closing </pre>: treat the remainder as pre content
+                    ($pre_text, $text) = ($text, "");
+                }
+                $pre_text =~ s/[<>]//g;  # remove all < and >
+                $outtext .= $pre_text . " ";
+            }
+        }
+        $outtext .= $text;               # add any remaining text
+        return $outtext;
+    }
+
+    if ($para) {
+        $text =~ s/(<p\b)/<Paragraph>$1/gi;
+    }
+    return $text;
+}
 sub filter_text {
 …
     # get the parameters for the output
     my ($fields) = $self->{'index'};
+    #print STDERR "fields are $fields\n";
+    $fields =~ s/\ball\b/Title,Creator,text/; # add in others here
+    my ($sectiontag) = "";
+    if ($self->{'levels'}->{'Section'}) {
+    $sectiontag = "\n<Section>\n";
+    }
+    my ($paratag) = "";
+    if ($self->{'levels'}->{'Paragraph'}) {
+    $paratag = "<Paragraph>";
+    }
     my $doc_section = 0; # just for this document
     my $text = "";
 …
     $doc_section++;
     $self->{'num_sections'} += 1;
+    $text .= "<Section>\n";
+    $text .= $sectiontag;
     if ($indexed_doc) {
         $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
 …
         if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
             my $new_text = "";
+            my $tmp_text = "";
             if ($real_field eq "text") {
+            #print STDERR "in text bit";
+            #$new_text = "<Paragraph>";
+            $new_text .= $doc_obj->get_text ($section);
+            #$self->find_paragraphs($new_text);
+            if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
+                $new_text .= "<TX>\n";
+                $tmp_text .= $doc_obj->get_text ($section);
+                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, $self->{'levels'}->{'Paragraph'});
+                $new_text .= "$tmp_text</TX>\n";
+                if (!defined $self->{'indexfields'}->{'TextOnly'}) {
+                $self->{'indexfields'}->{'TextOnly'} = 1;
+                }
+            }
+            else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
+                $new_text .= $doc_obj->get_text ($section);
+                            #if ($self->{'levels'}->{'Paragraph'}) {
+                #$self->find_paragraphs($new_text);
+                #}
+            }
             } else { # metadata field
             if ($real_field eq "metadata") { # insert all metadata
                                              #except gsdl stuff
                 #print STDERR "in metadata bit\n";
+                #except gsdl stuff
+                my $shortname = "";
                 my $metadata = $doc_obj->get_all_metadata ($section);
                 foreach $pair (@$metadata) {
                 my ($mfield, $mvalue) = (@$pair);
+                #print STDERR "$mfield, $mvalue\n";
+                # check fields here, maybe others dont want
+                # check fields here, maybe others dont want - change to use dontindex!!
                 if ($mfield ne "Identifier" && $mfield ne "classifytype" &&
                     $mfield !~ /^gsdl/ && defined $mvalue && $mvalue ne "") {
+                    $new_text .= "<$mfield>$mvalue</$mfield>\n";
+                    #print STDERR "metadata=$mfield:$mvalue";
+                    if (!defined $self->{'indexfields'}->{$mfield}) {
+                        $self->{'indexfields'}->{$mfield} = 1;
+                    }
+                    if (defined $self->{'indexfieldmap'}->{$mfield}) {
+                    $shortname = $self->{'indexfieldmap'}->{$mfield};
+                    }
+                    else {
+                    $shortname = $self->create_shortname($mfield);
+                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
+                    $self->{'indexfieldmap'}->{$shortname} = 1;
+                    }
+                    $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
+                    if (!defined $self->{'indexfields'}->{$mfield}) {
+                    $self->{'indexfields'}->{$mfield} = 1;
+                    }
+                }
+                }
+            }
             else { #individual metadata specified
+                my $shortname="";
                 if (!defined $self->{'indexfields'}->{$real_field}) {
                 $self->{'indexfields'}->{$real_field} = 1;
+                }
+                }
+                if (defined $self->{'indexfieldmap'}->{$real_field}) {
+                $shortname = $self->{'indexfieldmap'}->{$real_field};
+                }
+                else {
+                $shortname = $self->create_shortname($real_field);
+                $self->{'indexfieldmap'}->{$real_field} = $shortname;
+                $self->{'indexfieldmap'}->{$shortname} = 1;
+                }
                 foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
                 $new_text .= "<$real_field>$item</$real_field>\n";
+                $new_text .= "$paratag<$shortname>$item</$shortname>\n";
+                }
+            }
 …
             $new_text =~ /[\(\)\{\}]/) {
+            }
+            $self->{'num_processed_bytes'} += length ($new_text);
             $text .= "$new_text";
+        }
 …
+}
+# create_shortname ($realname)
+#
+# Derives a two-character upper-case short name for an index field.
+# Any name that already has a defined entry in $self->{'indexfieldmap'}
+# counts as taken.  This routine does not record the name it returns.
+#
+# Candidates are tried in this order:
+#   1. the first two word characters of $realname
+#   2. the first character paired with the third, fourth, ... character
+#   3. the same again after dropping the leading character of $realname,
+#      repeated until fewer than two characters remain
+#   4. if everything above is taken (or $realname is too short), the first
+#      free name in the sequence AA, AB, ..., ZZ, AAA, ...
+# Returns the chosen short name; it is always defined.
+sub create_shortname {
+    my $self = shift(@_);
+    my ($realname) = @_;
+    $realname = "" unless defined $realname;
+    my $used = $self->{'indexfieldmap'} || {};
+
+    # $gap is the number of characters skipped between the two letters
+    my $gap = 0;
+    while (length($realname) >= 2) {
+        if ($realname =~ /^(\w).{$gap}(\w)/) {
+            my $shortname = "$1$2";
+            $shortname =~ tr/a-z/A-Z/;
+            return $shortname unless defined $used->{$shortname};
+            $gap++;
+        }
+        else {
+            # no pairing left for this leading character: drop it and
+            # start again with adjacent characters
+            $realname =~ s/^.//;
+            $gap = 0;
+        }
+    }
+
+    # Every name derived from $realname is taken; fall back to the first
+    # unused name so the caller always gets a usable tag and the loop
+    # always terminates.
+    my $fallback = "AA";
+    $fallback++ while defined $used->{$fallback};
+    return $fallback;
+}
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1852 for trunk/gsdl/perllib/mgppbuildproc.pm

Legend:

trunk/gsdl/perllib/mgppbuildproc.pm

Download in other formats: