Context Navigation

← Previous Changeset
Next Changeset →

Changeset 10426

Timestamp:

2005-08-05T15:16:47+12:00 (19 years ago)

Author:

chi

Message:

Add an option -extracted_word_metadata to extract metadata, based on user-defined fields, from an HTML document
(converted by VB scripting)

File:

: 1 edited

trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm (modified) (14 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm

-              r10404
+              r10426
     return bless $self, $class;
+}
 …
     print $outhandle "StructuredHTMLPlug: processing $file\n"
         if $self->{'verbosity'} > 1;
     my @head_and_body = split(/<body/i,$$textref);
     my $head = shift(@head_and_body);
     my $body_text = join("<body", @head_and_body);
+    if (defined $self->{'extracted_word_metadata_fields'}) {
+    my @doc_properties = split(/<xml>/i,$head);
+    my $doc_heading = shift(@doc_properties);
+    my $rest_doc_properties = join(" ", @doc_properties);
+    my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
+    my $extracted_metadata = shift (@extracted_metadata);
+    $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
+    }
     # If checkout_toc is enabled, get rid of the TOC and TOF contents.
     # get rid of TOC and TOF sections and their title
     if ($self->{'checkout_toc'}){
+    #if (defined $self->{'checkout_toc'}){
     #line-height:150%;mso-ansi-language:FR'>Contents<o:p></o:p></span></b></p>
     # get rid of Table of Contents title and Table of Figures
     #$body_text =~ s/<p[^>]*><b><span[^>]*>(Table of Content.|Content.)<o:p><\/o:p><\/span><\/b><\/p>//isg;
     #$body_text =~ s/<p[^>]*><b><span[^>]*>(Table of Figure.|Figure.)<o:p><\/o:p><\/span><\/b><\/p>//isg;
     $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
     $body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
+    }
     if ($self->{'title_header'}){
+    #$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
+    #$body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
+    #}
+    if (defined $self->{'title_header'}){
     $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
     $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
+    }
     if ($self->{'level1_header'}){
+    if (defined $self->{'level1_header'}){
     $self->{'level1_header'} =~ s/^(\()(.*)(\))/$2/is;
     $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
+    }
     if ($self->{'level2_header'}){
+    if (defined $self->{'level2_header'}){
     $self->{'level2_header'} =~ s/^(\()(.*)(\))/$2/is;
     $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
+    }
     if ($self->{'level3_header'}){
+    if (defined $self->{'level3_header'}){
     $self->{'level3_header'} =~ s/^(\()(.*)(\))/$2/is;
     $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
+    }
     # Tidy up extra new lines
     $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
     $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
     my $body = "<body".$body_text;
     my $section_text = $head;
     $section_text .= "<!--\n<Section>\n-->\n";
     # split HTML text on <h1>, <h2> etc tags
     my @h_split = split(/<h/i,$body);
     my $hnum = 0;
     my $sectionh1 = 0;
     $section_text .= shift(@h_split);
     my $hc;
     foreach $hc ( @h_split )
 …
         $h_text =~ s/^\s$//s;
         $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
         if ($h_text =~ m/\w+/)
+        {
 …
             print $outhandle $spacing."$h_text\n"
             if $self->{'verbosity'} > 2;
             $sectionh1++ if ($hnum==1);
+        }
 …
+        }
-        # $section_text .= "<!-- \n</Section>\n-->\n";
-        #print STDERR "***HC = $hc\n";
         $section_text .= "<h$hc";
+    }
 …
     $$textref = $section_text;
 # should be textref not testref???
 #    $$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
+    # should be textref not testref???
+    #$$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
     if ($sectionh1>0)
+    {
 …
     print $outhandle "  Passing on the HTMLPlug\n"
     if $self->{'verbosity'} > 1;
     $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
     $$textref =~ s/(&nbsp;)+/&nbsp;/sg;
 ##    $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
+    ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
     $self->SUPER::process(@_);
     # associate original file with doc object
     my $cursection = $doc_obj->get_top_section();
 …
     $doc_obj->associate_file($filename, "doc.doc", undef, $cursection);
     my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>";
     $doc_obj->add_utf8_metadata ($cursection, "srclink",  $doclink);
 …
+{
     my ($self,$front,$back,$base_dir,$href) = @_;
     # dig out width and height of image, if there
     my $img_attributes = "$front back";
     my ($img_width)  = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
     my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
     # derive local filename for image based on its URL
     my $img_filename = $href;
     $img_filename =~ s/^[^:]*:\/\///;
     $img_filename = &util::filename_cat($base_dir, $img_filename);
     # Replace %20's in URL with a space if required. Note that the filename
     # may include the %20 in some situations
 …
     if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
     # get image info on width and height
     my $outhandle = $self->{'outhandle'};
     my $verbosity = $self->{'verbosity'};
 …
     my ($image_type, $actual_width, $actual_height, $image_size)
         = &ImagePlug::identify($img_filename, $outhandle, $verbosity);
     #print STDERR "**** $actual_width x $actual_height";
     #print STDERR " (requested: $img_width x $img_height)\n";
 …
     if (($img_width < $actual_width) || ($img_height < $actual_height)) {
         print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
         # derive new image name based on current image
         my ($tailname, $dirname, $suffix)
         = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
         my $resized_filename
         = &util::filename_cat($dirname, $tailname."_resized".$suffix);
         #print STDERR "**** suffix = $suffix\n";
         # Generate smaller image with convert
         my $newsize = "$img_widthx$image_height";
 …
         my $result = '';
         print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
+    }
+    }
+    }
+    }
     return $href;
+}
 sub replace_images {
 …
     $back="\"$back";
+    }
     $link =~ s/\n/ /g;
     my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
 ##    $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
     my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
     return $front . $middle . $back;
+}
+# Pull user-selected <o:NAME>value</o:NAME> properties out of a chunk of
+# text and store each one as metadata on the top section of $doc_obj.
+#
+# Arguments (after $self):
+#   $xml_text - plain string (not a reference) to search; may be undef
+#               when the document head had no <xml> block
+#   $metadata - not used here; kept so existing callers need not change
+#   $doc_obj  - document object that receives add_utf8_metadata() calls
+#
+# $self->{'extracted_word_metadata_fields'} is a comma-separated list.
+# Each entry is either "name" (stored under the same name) or
+# "name<gsname>" (the <o:name> value is stored as metadata "gsname").
+sub extract_metadata
+{
+    my $self = shift (@_);
+    my ($xml_text, $metadata, $doc_obj) = @_;
+    my $outhandle = $self->{'outhandle'};
+
+    # Nothing to search, or nothing requested: avoid matching against undef.
+    return unless defined $xml_text;
+    return unless defined $self->{'extracted_word_metadata_fields'};
+
+    foreach my $raw_field (split /,/, $self->{'extracted_word_metadata_fields'}) {
+        # tolerate "Author, Company" style lists
+        (my $field = $raw_field) =~ s/^\s+|\s+$//g;
+
+        # by default the metadata keeps the name used in the document
+        my ($source_name, $gs_name) = ($field, $field);
+
+        # support tag<tagname>: "$2" is the user's preferred gs metadata name
+        if ($field =~ /^(.*?)<(.*?)>$/) {
+            ($source_name, $gs_name) = ($1, $2);
+        }
+
+        # stray commas or an empty mapping would otherwise match "<o:>"
+        # or create metadata with no name
+        next if $source_name eq "" || $gs_name eq "";
+
+        # \Q..\E keeps regex metacharacters in the field name literal;
+        # the non-greedy capture stops at the first closing tag and /s
+        # lets a value span several lines.
+        next unless $xml_text =~ m/<o:\Q$source_name\E>(.*?)<\/o:\Q$source_name\E>/is;
+        my $value = $1;
+
+        # clean up surrounding whitespace (including trailing newlines) and add
+        $value =~ s/^\s+|\s+$//g;
+
+        print $outhandle " extracted \"$gs_name\" metadata \"$value\"\n"
+            if ($self->{'verbosity'} > 2);
+        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $gs_name, $value);
+    }
+
+    return;
+}
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 10426

Legend:

trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm

Download in other formats: