Changeset 10426


Ignore:
Timestamp:
2005-08-05T15:16:47+12:00 (19 years ago)
Author:
chi
Message:

Add an option -extracted_word_metadata to extract metadata, based on user-defined fields, from an HTML document (converted by
VB Scripting)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/StructuredHTMLPlug.pm

    r10404 r10426  
    6363   
    6464    return bless $self, $class;
    65 
    6665}
    6766
     
    9392    print $outhandle "StructuredHTMLPlug: processing $file\n"
    9493        if $self->{'verbosity'} > 1;
    95 
    9694    my @head_and_body = split(/<body/i,$$textref);
    9795    my $head = shift(@head_and_body);
    9896    my $body_text = join("<body", @head_and_body);
    9997   
     98    if (defined $self->{'extracted_word_metadata_fields'}) {
     99    my @doc_properties = split(/<xml>/i,$head);
     100    my $doc_heading = shift(@doc_properties);
     101    my $rest_doc_properties = join(" ", @doc_properties);
     102    my @extracted_metadata = split(/<\/xml>/i, $rest_doc_properties);
     103    my $extracted_metadata = shift (@extracted_metadata);
     104    $self->extract_metadata($extracted_metadata, $metadata, $doc_obj);
     105    }
     106   
    100107    # If checkout_toc is enabled, it means to get rid of the TOC and TOF contents.
    101108    # get rid of TOC and TOF sections and their title
    102     if ($self->{'checkout_toc'}){
     109    #if (defined $self->{'checkout_toc'}){
    103110    #line-height:150%;mso-ansi-language:FR'>Contents<o:p></o:p></span></b></p>
    104111    # get rid of Table of Contents title and Table of Figures
    105112    #$body_text =~ s/<p[^>]*><b><span[^>]*>(Table of Content.|Content.)<o:p><\/o:p><\/span><\/b><\/p>//isg;
    106113    #$body_text =~ s/<p[^>]*><b><span[^>]*>(Table of Figure.|Figure.)<o:p><\/o:p><\/span><\/b><\/p>//isg;
    107     $body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
    108     $body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
    109     }
    110 
    111     if ($self->{'title_header'}){
     114    #$body_text =~ s/<p class=(($self->{'toc_header'})[^>]*)>(.+?)<\/p>//isg;
     115    #$body_text =~ s/<p class=(($self->{'tof_header'})[^>]*)>(.+?)<\/p>//isg;
     116    #}
     117
     118    if (defined $self->{'title_header'}){
    112119    $self->{'title_header'} =~ s/^(\()(.*)(\))/$2/is;
    113120    $body_text =~ s/<p class=(($self->{'title_header'})[^>]*)>(.+?)<\/p>/<p class=$1><title>$3<\/title><\/p>/isg;
    114121    }
    115     if ($self->{'level1_header'}){
     122    if (defined $self->{'level1_header'}){
    116123    $self->{'level1_header'} =~ s/^(\()(.*)(\))/$2/is;
    117124    $body_text =~ s/<p class=(($self->{'level1_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h1>$3<\/h1><\/p>/isg;
    118125    }
    119     if ($self->{'level2_header'}){
     126    if (defined $self->{'level2_header'}){
    120127    $self->{'level2_header'} =~ s/^(\()(.*)(\))/$2/is;
    121128    $body_text =~ s/<p class=(($self->{'level2_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h2>$3<\/h2><\/p>/isg;
    122129    }
    123130   
    124     if ($self->{'level3_header'}){
     131    if (defined $self->{'level3_header'}){
    125132    $self->{'level3_header'} =~ s/^(\()(.*)(\))/$2/is;
    126133    $body_text =~ s/<p class=(($self->{'level3_header'})[^>]*)>(.+?)<\/p>/<p class=$1><h3>$3<\/h3><\/p>/isg;
    127134    }
    128        
    129135    # Tidy up extra new lines
    130136    $body_text =~ s/(<p[^>]*><span[^>]*><o:p>&nbsp;<\/o:p><\/span><\/p>)//isg;
    131137    $body_text =~ s/(<p[^>]*><o:p>&nbsp;<\/o:p><\/p>)//isg;
    132 
     138   
    133139    my $body = "<body".$body_text;
    134140   
    135141    my $section_text = $head;
    136142    $section_text .= "<!--\n<Section>\n-->\n";
    137 
     143   
    138144    # split HTML text on <h1>, <h2> etc tags
    139145    my @h_split = split(/<h/i,$body);
    140  
     146    
    141147    my $hnum = 0;
    142148
    143149    my $sectionh1 = 0;
    144150    $section_text .= shift(@h_split);
    145 
     151   
    146152    my $hc;
    147153    foreach $hc ( @h_split )
     
    162168        $h_text =~ s/^\s$//s;
    163169        $h_text =~ s/(&nbsp;)+\W*/&nbsp;/sg;
    164 
     170       
    165171        if ($h_text =~ m/\w+/)
    166172        {
     
    202208            print $outhandle $spacing."$h_text\n"
    203209            if $self->{'verbosity'} > 2;
    204 
     210           
    205211            $sectionh1++ if ($hnum==1);
    206212        }
     
    210216       
    211217        }
    212         # $section_text .= "<!-- \n</Section>\n-->\n";
    213         #print STDERR "***HC = $hc\n";
    214218        $section_text .= "<h$hc";
    215219    }
     
    232236
    233237    $$textref = $section_text;
    234 
    235 # should be textref not testref???
    236 #    $$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
    237 
     238   
     239    # should be textref not testref???
     240    #$$testref =~ s/<h(\d+)>(.*?)<\/h$1>/<Section><Metadata name=\"Title\">$1<\/Metadata></Section><h$1><\/h$1>/gi;
     241   
    238242    if ($sectionh1>0)
    239243    {
     
    243247    print $outhandle "  Passing on the HTMLPlug\n"
    244248    if $self->{'verbosity'} > 1;
    245 
     249   
    246250    $$textref =~ s/<!\[if !vml\]>/<![if vml]>/g;
    247 
     251   
    248252    $$textref =~ s/(&nbsp;)+/&nbsp;/sg;   
    249 
    250 ##    $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
     253   
     254    ## $$textref =~ s/<o:p>&nbsp;<\/o:p>//g; # used with VML to space figures?
    251255   
    252256    $self->SUPER::process(@_);
    253 
     257   
    254258    # associate original file with doc object
    255259    my $cursection = $doc_obj->get_top_section();
     
    261265   
    262266    $doc_obj->associate_file($filename, "doc.doc", undef, $cursection);
    263 
     267   
    264268    my $doclink = "<a href=_httpcollection_/index/assoc/[archivedir]/doc.doc>";
    265269    $doc_obj->add_utf8_metadata ($cursection, "srclink",  $doclink);
     
    289293{
    290294    my ($self,$front,$back,$base_dir,$href) = @_;
    291 
     295   
    292296    # dig out width and height of image, if there
    293297    my $img_attributes = "$front back";
    294298    my ($img_width)  = ($img_attributes =~ m/\s+width=\"?(\d+)\"?/i);
    295299    my ($img_height) = ($img_attributes =~ m/\s+height=\"?(\d+)\"?/i);
    296 
     300   
    297301    # derive local filename for image based on its URL
    298302    my $img_filename = $href;
    299303    $img_filename =~ s/^[^:]*:\/\///;
    300304    $img_filename = &util::filename_cat($base_dir, $img_filename);
    301        
     305   
    302306    # Replace %20's in URL with a space if required. Note that the filename
    303307    # may include the %20 in some situations
     
    309313    if ((-e $img_filename) && (defined $img_width) && (defined $img_height)) {
    310314    # get image info on width and height
    311 
     315   
    312316    my $outhandle = $self->{'outhandle'};
    313317    my $verbosity = $self->{'verbosity'};
     
    315319    my ($image_type, $actual_width, $actual_height, $image_size)
    316320        = &ImagePlug::identify($img_filename, $outhandle, $verbosity);
    317 
     321   
    318322    #print STDERR "**** $actual_width x $actual_height";
    319323    #print STDERR " (requested: $img_width x $img_height)\n";
     
    321325    if (($img_width < $actual_width) || ($img_height < $actual_height)) {
    322326        print $outhandle "Resizing $img_filename\n" if ($verbosity > 0);
    323 
     327       
    324328        # derive new image name based on current image
    325329        my ($tailname, $dirname, $suffix)
    326330        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    327 
     331       
    328332        my $resized_filename
    329333        = &util::filename_cat($dirname, $tailname."_resized".$suffix);
    330 
     334       
    331335        #print STDERR "**** suffix = $suffix\n";
    332 
     336       
    333337        # Generate smaller image with convert
    334338        my $newsize = "$img_widthx$image_height";
     
    338342        my $result = '';
    339343        print $outhandle "ImageResize result: $result\n" if ($verbosity > 2);
    340 
    341     }
    342     }
    343 
     344    }
     345    }
    344346    return $href;
    345347}
    346 
    347 
    348 
    349348
    350349sub replace_images {
     
    358357    $back="\"$back";
    359358    }
    360 
     359   
    361360    $link =~ s/\n/ /g;
    362 
     361   
    363362    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
    364 
     363   
    365364##    $href = $self->resize_if_necessary($front,$back,$base_dir,$href);
    366 
     365   
    367366    my $middle = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
    368 
     367   
    369368    return $front . $middle . $back;
    370369}
    371370
     371sub extract_metadata # Find the user-requested <o:FIELD> elements in the given text and add their contents as metadata on the document's top section.
     372{
     373    my $self = shift (@_);
     374    my ($textref, $metadata, $doc_obj) = @_; # $textref is matched as a plain string below (never dereferenced); $metadata is not used in this sub
     375    my $outhandle = $self->{'outhandle'};
     376   
     377    # Metadata fields to extract/save. The key is the (lowercased) name of the
     378    # field as requested by the user, the value is the metadata name for Greenstone to use.
     379    my %find_fields = ();
     380    my ($tag,$value);
     381
     382    my $orig_field = ""; # field name in its original case, as used in the <o:...> match below
     383    foreach my $field (split /,/, $self->{'extracted_word_metadata_fields'}) {
     384    # Support the "tag<tagname>" form: field "tag" is stored under the metadata name "tagname".
     385    if ($field =~ /^(.*?)<(.*?)>$/) {
     386        # "$2" is the user's preferred gs metadata name
     387        $find_fields{lc($1)}=$2; # lc = lowercase
     388        $orig_field = $1;
     389    } else { # no <tagname> given, so the field name doubles as the metadata name
     390        # "$field" is the user's preferred gs metadata name
     391        $find_fields{lc($field)}=$field; # lc = lowercase
     392        $orig_field = $field;
     393    }
     394    if ($textref =~ m/<o:$orig_field>(.*)<\/o:$orig_field>/i){ # case-insensitive; NOTE(review): field name is interpolated unescaped, and '.' will not cross a newline (no /s)
     395        $tag = $orig_field;
     396        $value = $1;
     397        if (!defined $value || !defined $tag){ # NOTE(review): looks unreachable - a successful match always defines $1 and $tag was just assigned
     398        print $outhandle "StructuredHTMLPlug: can't find VALUE in \"$tag\"\n";
     399        next;
     400        } else {
     401        # Clean up the value and add it to the document.
     402        chomp($value); # remove trailing \n, if any
     403        $tag = $find_fields{lc($tag)}; # switch to the metadata name recorded for this field above
     404        print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
     405            if ($self->{'verbosity'} > 2);
     406        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), $tag, $value);
     407        }
     408    }
     409    }
     410}
    372411
    3734121;
Note: See TracChangeset for help on using the changeset viewer.